/*
  Copyright (C) 2006 Helge Hess

  This file is part of JOPE.

  JOPE is free software; you can redistribute it and/or modify it under
  the terms of the GNU Lesser General Public License as published by the
  Free Software Foundation; either version 2, or (at your option) any
  later version.

  JOPE is distributed in the hope that it will be useful, but WITHOUT ANY
  WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with JOPE; see the file COPYING.  If not, write to the
  Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  02111-1307, USA.
*/

package org.opengroupware.jope.appserver.templates;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.opengroupware.jope.appserver.core.WOElement;
import org.opengroupware.jope.appserver.elements.WOStaticHTMLElement;
import org.opengroupware.jope.foundation.NSPropertyListParser;

// TODO: complete me
/*
 * WOHTMLParser
 *
 * This parser parses "old-style" .wo templates. It does *not* process the
 * whole HTML of the file, it only searches for text sections which start
 * with "<#". That way you can process "illegal" HTML code, eg:
 *
 *   <a href="<#MyLink" />"> ...
 *
 * So the syntax is:
 *   <#wod-name>...</#wod-name>
 * 
 * Internals
 * 
 *  The root parse function is _parseElement() which calls either 
 *  _parseWOElement() or _parseHashElement() if it finds a NGObjWeb tag at the 
 *  beginning of the buffer. 
 *  If it doesn't, it collects all content until it encounters an NGObjWeb tag
 *  and reports that content as "static text" to the callback.
 *  
 *  Parsing a dynamic element is:
 *    - parse the start tag
 *    - parse the attributes
 *    - parse the contents, static strings and elements
 *      - add content to a children array
 *    - produce WOElement by calling
 *      -dynamicElementWithName:attributes:contentElements:
 *    - parse close tag
 * 
 * Note: this is a straight port of the ObjC parser and therefore somewhat
 *       clumsy.
 */
public class WOHTMLParser {
  
  /** Logger shared by the template machinery (category "WOTemplates"). */
  protected final Log log = LogFactory.getLog("WOTemplates");

  /** Receives parse notifications and builds the dynamic elements. */
  protected WOHTMLParserHandler handler = null;
  /** Last parse error, or null if no error occurred (see addException()). */
  protected Exception lastException = null;
  /** Current read position in buffer, -1 while no parse is running. */
  protected int    idx    = -1;
  /** Length of buffer, -1 while no parse is running. */
  protected int    len    = -1;
  /** The characters being parsed, null while no parse is running. */
  protected char[] buffer = null;
  
  protected static final boolean debugOn = false;
  
  /*
   * When true, _parseElement() skips from the '<' of a plain (non-marker) tag
   * straight to its closing '>', so markers inside plain tags are not seen.
   */
  protected static boolean skipPlainTags = false;
  /* Not referenced by the code in this class. */
  protected static boolean compressHTMLWhitespace = true;
  
  /* accessors */
  
  /**
   * Sets the handler which is notified about the parsing progress and which
   * is asked to build the dynamic elements.
   * 
   * @param _handler the handler to use for subsequent parse runs
   */
  public void setHandler(WOHTMLParserHandler _handler) {
    this.handler = _handler;
  }
  
  /* top-level parsing */
  
  /**
   * Parses the given template characters into a list of top-level elements.
   * <p>
   * The handler is asked first (willParseHTMLData) and notified afterwards
   * (finishedParsingHTMLData or failedParsingHTMLData). The parser state
   * (buffer, idx, len) is only valid during the call.
   * 
   * @param _data the template contents, may be null
   * @return the top-level elements, or null if no handler is set, the handler
   *         vetoed the parse, _data is null or a parse error occurred (in
   *         the latter case lastException() returns the error)
   */
  public List<WOElement> parseHTMLData(char[] _data) {
    if (this.handler == null) {
      /* without a handler we can neither notify nor build elements */
      this.log.error("cannot parse HTML data, no handler is set");
      return null;
    }
    if (!this.handler.willParseHTMLData(this, _data))
      return null;
    if (_data == null)
      return null;
    
    /* reset state */

    this.lastException = null;
    this.buffer = _data;
    this.idx    = 0;
    this.len    = this.buffer.length;
    
    /* start parsing */
    
    List<WOElement> topLevel = new ArrayList<WOElement>(16);
    while ((this.idx < this.len) && (this.lastException == null)) {
      int lastIdx = this.idx;
      WOElement element = this._parseElement();
      if (element == null) {
        if (this.idx == lastIdx) {
          /* nothing was consumed, stop to avoid looping forever */
          this.log.error("parseElement didn't parse anything at: " + this.idx);
          break;
        }
        continue;
      }
      
      topLevel.add(element);
    }
    
    /* notify handler of result */
    
    if (this.lastException != null)
      this.handler.failedParsingHTMLData(this, _data, this.lastException);
    else
      this.handler.finishedParsingHTMLData(this, _data, topLevel);
    
    /* reset temporary state */
    
    this.buffer = null;
    this.idx    = -1;
    this.len    = -1;
    
    return this.lastException != null ? null : topLevel;
  }
  
  /**
   * Parses the characters of the given String, see parseHTMLData(char[]).
   * 
   * @param _data the template contents, may be null
   * @return the top-level elements, or null if _data is null or parsing failed
   */
  public List<WOElement> parseHTMLData(String _data) {
    if (_data == null)
      return null;
    return this.parseHTMLData(_data.toCharArray());
  }
  
  /**
   * Decodes the given bytes as UTF-8 and parses the result.
   * 
   * @param _buf the raw template contents, may be null
   * @return the top-level elements, or null if _buf is null, the bytes could
   *         not be decoded or parsing failed
   */
  public List<WOElement> parseHTMLData(byte[] _buf) {
    if (_buf == null)
      return null;
    
    // TODO: check prefix for encoding
    try {
      return this.parseHTMLData(new String(_buf, "utf8"));
    }
    catch (UnsupportedEncodingException e) {
      this.log.error("failed to transform byte array to UTF-8", e);
      return null;
    }
  }

  /**
   * Loads the stream contents using NSPropertyListParser.loadContentFromStream
   * and parses them. This method does not close the stream itself.
   * 
   * @param _in the stream to read the template from, may be null
   * @return the top-level elements, or null if _in is null or parsing failed
   */
  public List<WOElement> parseHTMLData(InputStream _in) {
    if (_in == null)
      return null;
    
    return this.parseHTMLData(NSPropertyListParser.loadContentFromStream(_in));
  }
  
  /**
   * Opens the given URL, parses its contents and closes the stream again.
   * 
   * @param _url the location of the template, may be null
   * @return the top-level elements, or null if _url is null, the URL could
   *         not be read or parsing failed
   */
  public List<WOElement> parseHTMLData(URL _url) {
    if (_url == null)
      return null;
    
    InputStream in = null;
    try {
      in = _url.openStream();
      return this.parseHTMLData(in);
    }
    catch (IOException e) {
      this.log.error("could not read from URL: " + _url, e);
      return null;
    }
    finally {
      /* we opened the stream, so we are responsible for closing it */
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException e) {
          this.log.warn("could not close stream of URL: " + _url, e);
        }
      }
    }
  }
  
  /* error handling */
  
  /**
   * @return the error recorded by the last parse run, or null
   */
  public Exception lastException() {
    return this.lastException;
  }
  /** Clears the recorded parse error. */
  public void resetLastException() {
    this.lastException = null;
  }
  
  /**
   * Records a parse error. Only the most recent error is kept; the parse
   * loops stop as soon as an error is recorded.
   * 
   * @param _error description of the problem
   */
  protected void addException(String _error) {
    // TODO: keep old exceptions?
    // TODO: SOPE has some elaborate error tracking which we might want to add
    this.log.debug("exception: " + _error);
    this.lastException = new Exception(_error);
  }
  
  /* parsing */
  
  /**
   * Parses the next element at the current position. This is either a dynamic
   * element (hash tag or WEBOBJECT tag) or a run of static content which
   * extends up to the next dynamic open or close tag.
   * 
   * @return the parsed element, or null on EOF, on errors, or if no static
   *         content precedes an (unexpected) close tag. In the last case the
   *         position is NOT advanced, callers must check for progress.
   */
  protected WOElement _parseElement() {
    boolean isDebugOn = this.log.isDebugEnabled();
    
    if (this.idx >= this.len) /* EOF */
      return null;
    
    if (this._isHashTag()) {
      /* start parsing of dynamic content */
      if (isDebugOn) this.log.debug("detected hash element ...");
      return this._parseHashElement();
    }
    if (this._isHashCloseTag()) {
      this.log.warn("unexpected hash close tag (</#...>)");
      // TODO: in SOPE we raise an exception
    }

    if (this._isWOTag()) {
      /* start parsing of dynamic content */
      if (isDebugOn) this.log.debug("detected WO element ...");
      return this._parseWOElement();
    }
    if (this._isWOCloseTag()) {
      this.log.warn("unexpected WEBOBJECT close tag (</WEBOBJECT>)");
      // TODO: in SOPE we raise an exception
    }

    /* parse text/tag content */
    int startPos = this.idx;
    while (this.idx < this.len) {
      /* scan until we find a tag marker '<' */
      while ((this.idx < this.len) && (this.buffer[this.idx] != '<'))
        this.idx++;
      
      if (this.idx >= this.len) /* EOF was reached */
        break;
      
      /* stop the static run at any tag which we parse ourselves */

      if (this._isHashTag() || this._isHashCloseTag() ||
          this._isWOTag()   || this._isWOCloseTag())
        break;

      if (this._isComment()) {
        if (isDebugOn) this.log.debug("detected comment ...");
        /*
         * Skip '<!-' only. The scan below starts at the second '-' of the
         * opening marker, which means that '<!--->' is accepted as a
         * complete comment.
         */
        this.idx += 3;

        while (this.idx < this.len) {
          if ((this.buffer[this.idx] == '-') && (this.idx + 2 < this.len) &&
              (this.buffer[this.idx + 1] == '-') && 
              (this.buffer[this.idx + 2] == '>')) {
            this.idx += 3; // skip '-->'
            break;
          }
          this.idx++;
        }
        if (this.idx >= this.len) // EOF was reached
          break;
      }
      else {
        if (isDebugOn) this.log.debug("read regular tag ...");
        
        // skip '<', read usual tag
        this.idx++;
        if (this.idx >= this.len) { // EOF was reached with opening '<'
          this.log.warn("reached EOF with '<' at end !");
          break;
        }
      
        if (skipPlainTags) {
          /* skip until end of HTML tag (not #-tag) */
          while ((this.idx < this.len) && (this.buffer[this.idx] != '>'))
            this.idx++;
          if (this.idx >= this.len) break; // EOF
          
          this.idx++; // skip '>'
        }
        /*
         * Otherwise continue scanning right after the '<'. We must not skip
         * the following char blindly, it could be the '<' of a marker.
         */
      }
    }
    
    if (this.idx - startPos <= 0) {
      if (isDebugOn) this.log.debug("static string element w/o content ...");
      return null;
    }
    
    String s = new String(this.buffer, startPos, this.idx - startPos);
    if (isDebugOn) this.log.debug("static string, length: " + s.length());
    return new WOStaticHTMLElement(s);
  }
  
  /** Advances the position over space, tab, CR and LF characters. */
  protected void _skipSpaces() {
    int pos = this.idx;
    
    while ((pos < this.len) && _isHTMLSpace(this.buffer[pos]))
      pos++;
    
    this.idx = pos;
  }
  
  /**
   * Parses the contents of a dynamic element up to (but not including) the
   * matching close tag.
   * 
   * @param _isHashElement true to stop at a hash close tag, false to stop at
   *                       a WEBOBJECT close tag
   * @param _children      list which receives the parsed content elements
   * @return true if the close tag was found, false if parsing stopped due to
   *         EOF or an error
   */
  private boolean _parseChildren
    (boolean _isHashElement, List<WOElement> _children)
  {
    while ((this.idx < this.len) && (this.lastException == null)) {
      boolean isAtCloseTag = _isHashElement
        ? this._isHashCloseTag() : this._isWOCloseTag();
      if (isAtCloseTag)
        return true;
      
      int lastIdx = this.idx;
      WOElement subElement = this._parseElement();
      
      if (subElement != null) {
        _children.add(subElement);
      }
      else if ((this.idx == lastIdx) && (this.lastException == null)) {
        /*
         * _parseElement() consumes nothing when it runs into a close tag of
         * the other tag style. Report that instead of looping forever.
         */
        this.addException("unexpected close tag in element content.");
        return false;
      }
    }
    return false;
  }
  
  /**
   * Skips a close tag: the given number of prefix characters, everything up
   * to the next '>' and the '>' itself. Never moves beyond the buffer end.
   * 
   * @param _prefixLen length of the tag start, eg 3 for "</#"
   */
  private void _skipCloseTag(int _prefixLen) {
    this.idx += _prefixLen;
    if (this.idx > this.len)
      this.idx = this.len;
    
    while ((this.idx < this.len) && (this.buffer[this.idx] != '>'))
      this.idx++;
    if (this.idx < this.len)
      this.idx++; // skip '>'
  }
  
  /**
   * Parses a hash element, either with content (&lt;#name&gt;...&lt;/#name&gt;)
   * or self-closing (&lt;#name/&gt;). The element is built by the handler; the
   * tag name is passed to it both as the name and as the "NAME" attribute.
   * <p>
   * Note: the name in the close tag is not compared with the open tag.
   * 
   * @return the element built by the handler, or null if the position is not
   *         at a hash tag or an error occurred (see lastException())
   */
  protected WOElement _parseHashElement() {
    boolean isDebugOn = this.log.isDebugEnabled();
    
    if (this.idx >= this.len) return null; /* EOF */
    
    if (!this._isHashTag())
      return null; /* not a hash tag */
    
    if (isDebugOn) this.log.debug("parse hash element ...");
    
    this.idx += 2; /* skip '<#', _isHashTag() ensures more chars follow */

    if (this.buffer[this.idx] == '/') {
      /* a tag starting like this: "<#/", probably a typo */
      this.log.error("typo in hash close tag ('<#/' => '</#').");
    }
    
    /* parse tag name */
    
    String name = this._parseStringValue();
    if (this.lastException != null) // if there was an error ..
      return null;
    if ((name == null) || (name.length() < 1)) {
      /* eg "<#>" or "<#/x>", previously this ended in a NullPointerException */
      this.addException("missing name in hash element tag.");
      return null;
    }
    this._skipSpaces();
    
    /* parse attributes */
    
    if (isDebugOn) this.log.debug("  parse hash attributes ...");
    Map<String,String> attrs = this._parseTagAttributes();
    if (this.lastException != null)
      return null; // invalid tag attrs
    
    if (this.idx >= this.len) {
      this.addException("unexpected EOF: missing '>' in hash tag (EOF).");
      return null; // unexpected EOF
    }
    
    /* parse tag end (> or /) */
    char tagEnd = this.buffer[this.idx];
    if ((tagEnd != '>') && (tagEnd != '/')) {
      this.addException("missing '>' in hash element tag.");
      return null;
    }
    
    boolean isAutoClose = false;
    boolean foundEndTag = false;
    List<WOElement> children = null;
    
    if (tagEnd == '>') {
      /* has sub-elements (<#name>...</#name>) */
      this.idx++; // skip '>'
      
      if (isDebugOn) this.log.debug("  parsing hash children ...");
    
      children = new ArrayList<WOElement>(16);
      foundEndTag = this._parseChildren(true /* hash */, children);
      if (this.lastException != null)
        return null; /* keep the error of the content, do not build */
      if (children.isEmpty())
        children = null; /* the handler gets null for "no content" */
    }
    else {
      /* has no sub-elements (<#name/>) */
      if (isDebugOn) this.log.debug("  is autoclose hash-tag ...");
      this.idx++; // skip '/'
      isAutoClose = true;
      if ((this.idx >= this.len) || (this.buffer[this.idx] != '>')) {
        this.addException("missing '>' in hash element tag.");
        return null;
      }
      this.idx++; // skip '>'
    }
    
    /* produce elements */
  
    if (attrs == null)
      attrs = new HashMap<String,String>(1);
    attrs.put("NAME", name); /* the tag name always wins over a NAME attr */
    
    WOElement element =
      this.handler.dynamicElementWithName(name, attrs, children);
    if (isDebugOn) this.log.debug("  hash element: " + element);
    
    if (element == null) { // build error
      this.addException("could not build hash element: " + name);
      return null;
    }
    
    if (isAutoClose)
      return element;
    
    if (!foundEndTag) {
      this.addException("did not find hash end tag (</#" + name + ">) ..");
      return null;
    }
    
    /* skip close tag ('</#name>') */
    if (!this._isHashCloseTag())
      this.log.error("invalid parser state ..");
    this._skipCloseTag(3 /* '</#' */);
    
    return element;
  }
  
  /**
   * Parses a WEBOBJECT element
   * (&lt;WEBOBJECT NAME="x"&gt;...&lt;/WEBOBJECT&gt;). The element name is
   * taken from the "NAME" (or "name") attribute and the element is built by
   * the handler.
   * 
   * @return the element built by the handler, or null if the position is not
   *         at a WEBOBJECT tag or an error occurred (see lastException())
   */
  protected WOElement _parseWOElement() {
    boolean isDebugOn = this.log.isDebugEnabled();
    
    if (this.idx >= this.len) return null; /* EOF */
    
    if (!this._isWOTag())
      return null; /* not a WEBOBJECT tag */
    
    if (isDebugOn) this.log.debug("parse WEBOBJECT element ...");
    
    this.idx += 10; /* skip '<WEBOBJECT' */
    
    /* parse attributes */
    
    if (isDebugOn) this.log.debug("  parse WEBOBJECT attributes ...");
    Map<String,String> attrs = this._parseTagAttributes();
    if (this.lastException != null)
      return null; // invalid tag attrs
    
    String name = null;
    if (attrs != null) {
      if ((name = attrs.get("NAME")) == null)
        name = attrs.get("name");
    }
    if ((name == null) || (name.length() < 1)) {
      /* also covers a tag w/o any attributes, which was dropped silently */
      this.addException("missing name in WEBOBJECT element tag.");
      return null;
    }
        
    if (this.idx >= this.len) {
      this.addException("unexpected EOF: missing '>' in WEBOBJECT tag (EOF).");
      return null; // unexpected EOF
    }
    
    /* parse tag end '>' */
    if (this.buffer[this.idx] != '>') {
      this.addException("missing '>' in WEBOBJECT element tag.");
      return null;
    }
    this.idx++; // skip '>'
    
    /* parse content */
          
    if (isDebugOn) this.log.debug("  parsing WEBOBJECT children ...");
    
    List<WOElement> children = new ArrayList<WOElement>(16);
    boolean foundEndTag = this._parseChildren(false /* WEBOBJECT */, children);
    if (this.lastException != null)
      return null; /* keep the error of the content, do not build */
    if (children.isEmpty())
      children = null; /* the handler gets null for "no content" */
    
    /* produce elements */
  
    WOElement element =
      this.handler.dynamicElementWithName(name, attrs, children);
    if (isDebugOn) this.log.debug("  WEBOBJECT element: " + element);
    
    if (element == null) { // build error
      this.addException("could not build WEBOBJECT element !.");
      return null;
    }
    
    if (!foundEndTag) {
      this.addException("did not find WEBOBJECT end tag (</WEBOBJECT>) ..");
      return null;
    }

    /* skip close tag ('</WEBOBJECT>') */
    if (!this._isWOCloseTag())
      this.log.error("invalid parser state ..");
    this._skipCloseTag(11 /* '</WEBOBJECT' */);
    
    return element;
  }
  
  /**
   * Parses a sequence of key=value pairs. Parsing stops at the first position
   * which does not start a key (eg at '&gt;' or '/'), at EOF or on errors.
   * 
   * @return the attributes, or null if no attribute was parsed. Errors are
   *         reported using lastException(), pairs parsed before the error are
   *         still returned.
   */
  protected Map<String,String> _parseTagAttributes() {
    this._skipSpaces();    
    if (this.idx >= this.len) return null; /* EOF */
    
    Map<String,String> attrs = null;
    do {
      this._skipSpaces();    
      if (this.idx >= this.len) break; /* EOF */
      
      /* read key */
      
      String key = this._parseStringValue();
      if (key == null) /* ended */
        break;
      
      /* The following parses:  space* '=' space* */

      this._skipSpaces();    
      if ((this.idx >= this.len) || (this.buffer[this.idx] != '=')) {
        this.addException("expected '=' after key in attributes ..");
        break;
      }
      this.idx++; /* skip '=' */
      
      this._skipSpaces();    
      if (this.idx >= this.len) { /* EOF */
        this.addException("expected value after key in attributes ..");
        break; /* unexpected EOF */
      }
      
      /* read value */
      
      String value = this._parseStringValue();
      if (value == null) {
        this.addException("expected value after key in attributes ..");
        break;
      }
      
      /* add to Map */
      
      if (attrs == null)
        attrs = new HashMap<String,String>(2);
      attrs.put(key, value);
    }
    while (this.idx < this.len);
    
    return attrs;
  }
  
  /**
   * Parses a tag name, attribute key or attribute value. The value is either
   * enclosed in double quotes (no escape sequences are processed) or extends
   * up to the next space, '&gt;', '=' or '/'. Leading spaces are skipped.
   * 
   * @return the string (possibly empty for ""), or null if there is no string
   *         at the current position or if a quoted string is not closed (in
   *         which case an exception is recorded)
   */
  protected String _parseStringValue() {
    this._skipSpaces();
    
    int pos = this.idx;
    if (pos >= this.len) return null; /* EOF */
    
    char c = this.buffer[pos];
    if (c == '>' || c == '/' || c == '=') return null;
    
    if (c == '"') { /* quoted string */    
      pos++; /* skip starting quote ('"') */
      int startPos = pos;
      
      /* loop until closing quote */
      while ((pos < this.len) && (this.buffer[pos] != '"'))
        pos++;
      
      if (pos >= this.len) { /* syntax error, quote not closed */
        this.idx = pos;
        this.addException("quoted string not closed (expected '\"')");
        return null;
      }
      
      this.idx = pos + 1; /* store pointer, skipping the closing quote */
      
      if (pos == startPos) /* empty string */
        return "";
      
      return new String(this.buffer, startPos, pos - startPos);
    }
    
    /* string without quotes */
    
    int startPos = pos;
    
    /* loop until '>' or '=' or '/' or space */
    while ((c != '>' && c != '=' && c != '/') && !_isHTMLSpace(c)) {
      pos++;
      if (pos >= this.len) break;
      c = this.buffer[pos];
    }
    this.idx = pos;
    
    if ((pos - startPos) == 0) /* wasn't a string .. */
      return null;
    
    return new String(this.buffer, startPos, pos - startPos);
  }
  
  /* lookahead */
  
  /**
   * Checks whether a comment starts at the current position, without
   * consuming anything.
   * 
   * @return true if the buffer continues with "&lt;!--" and at least the
   *         7 characters of an empty comment are left
   */
  protected boolean _isComment() {
    if ((this.idx + 7) > this.len) /* check whether it is long enough */
      return false;
    
    return (this.buffer[this.idx]     == '<') &&
           (this.buffer[this.idx + 1] == '!') &&
           (this.buffer[this.idx + 2] == '-') &&
           (this.buffer[this.idx + 3] == '-');
  }
  
  /**
   * Checks whether a hash open tag starts at the current position, without
   * consuming anything.
   * 
   * @return true if the buffer continues with "&lt;#" and at least the
   *         4 characters of a minimal tag are left
   */
  protected boolean _isHashTag() {
    if ((this.idx + 4) > this.len) /* check whether it is long enough */
      return false;

    return (this.buffer[this.idx] == '<') && (this.buffer[this.idx + 1] == '#');
  }

  /**
   * Checks whether a hash close tag starts at the current position, without
   * consuming anything.
   * 
   * @return true if the buffer continues with "&lt;/#" and at least the
   *         5 characters of a minimal close tag are left
   */
  protected boolean _isHashCloseTag() {
    if ((this.idx + 5) > this.len) /* check whether it is long enough */
      return false;
    
    /* all three chars must match (the check used to accept eg "x/#") */
    return (this.buffer[this.idx]     == '<') &&
           (this.buffer[this.idx + 1] == '/') &&
           (this.buffer[this.idx + 2] == '#');
  }
  
  /**
   * Checks whether a WEBOBJECT open tag starts at the current position,
   * without consuming anything. The tag name is matched case insensitively.
   * 
   * @return true if the buffer continues with "&lt;WEBOBJECT" and at least
   *         19 characters are left
   */
  protected boolean _isWOTag() {
    /* check for "<WEBOBJECT .......>" (len 19) (lowercase is allowed) */
    if ((this.idx + 18) >= this.len) /* check whether it is long enough */
      return false;
    if (this.buffer[this.idx] != '<') /* check whether it is a tag */
      return false;

    return _ucIsCaseEqual(this.buffer, this.idx, "<WEBOBJECT");
  }

  /**
   * Checks whether a WEBOBJECT close tag starts at the current position,
   * without consuming anything. The tag name is matched case insensitively.
   * 
   * @return true if the buffer continues with "&lt;/WEBOBJECT&gt;"
   */
  protected boolean _isWOCloseTag() {
    /* check for </WEBOBJECT> (len=12) */
    if ((this.idx + 12) > this.len) /* check whether it is long enough */
      return false;
    if (this.buffer[this.idx] != '<') /* check whether it is a tag */
      return false;
    
    return _ucIsCaseEqual(this.buffer, this.idx, "</WEBOBJECT>");
  }

  /**
   * Checks whether the buffer contains the given string at the given
   * position, ignoring the case of the characters.
   * 
   * @param _buf the buffer to look into
   * @param _pos position in the buffer where the comparison starts
   * @param _s   the string to compare with
   * @return true if the string fits into the buffer at _pos and all
   *         characters are equal when uppercased
   */
  protected static boolean _ucIsCaseEqual(char[] _buf, int _pos, String _s) {
    if (_buf == null || _s == null || _pos < 0)
      return false;
    
    int len = _s.length();
    if (_buf.length < _pos + len) /* does not fit */
      return false;
    
    for (int i = 0; i < len; i++) {
      char bufChar = Character.toUpperCase(_buf[_pos + i]);
      if (bufChar != Character.toUpperCase(_s.charAt(i)))
        return false;
    }
    return true;
  }
  
  /* charset classification */
  
  /**
   * @param _c the character to check
   * @return true for space, tab, carriage return and linefeed
   */
  public static boolean _isHTMLSpace(char _c) {
    switch (_c) {
      case ' ': case '\t': case '\r': case '\n':
        return true;
      default:
        return false;
    }
  }
}
