/* Copyright (C) 2006 Helge Hess This file is part of JOPE. JOPE is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. JOPE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with JOPE; see the file COPYING. If not, write to the Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package org.opengroupware.jope.appserver.templates; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.opengroupware.jope.appserver.core.WOElement; import org.opengroupware.jope.appserver.elements.WOStaticHTMLElement; import org.opengroupware.jope.foundation.NSPropertyListParser; // TODO: complete me /* * WOHTMLParser * * This parser parses "old-style" .wo templates. It does *not* process the * whole HTML of the file, it only searches for text sections which start * with "<#". That way you can process "illegal" HTML code, eg: * * "> ... * * So the syntax is: * <#wod-name>... * * Internals * * The root parse function is _parseElement() which calls either * _parseWOElement() or _parseHashElement() if it finds a NGObjWeb tag at the * beginning of the buffer. * If it doesn't, it collects all content till it encounters an NGObjWeb tag, * and reports that content as "static text" to the callback. 
*
 * Parsing a dynamic element is:
 *   - parse the start tag
 *   - parse the attributes
 *   - parse the contents, static strings and elements
 *     - add content to a children array
 *   - produce WOElement by calling
 *     -dynamicElementWithName:attributes:contentElements:
 *   - parse close tag
 *
 * Note: this is a straight port of the ObjC parser and therefore somewhat
 *       clumsy.
 */
public class WOHTMLParser {
  protected final Log log = LogFactory.getLog("WOTemplates");

  /* receives the parse callbacks; dereferenced without a null check below */
  protected WOHTMLParserHandler handler = null;

  /* last error recorded by addException(); a non-null value stops parsing */
  protected Exception lastException = null;

  /* parse cursor, buffer length and buffer; -1 / null while not parsing */
  protected int    idx    = -1;
  protected int    len    = -1;
  protected char[] buffer = null;

  protected static final boolean debugOn = false;

  /* do process markers inside HTML tags ? */
  protected static boolean skipPlainTags          = false;
  protected static boolean compressHTMLWhitespace = true;

  /* accessors */

  /**
   * Sets the handler which is consulted before parsing, builds the dynamic
   * elements and is notified about the parse result.
   */
  public void setHandler(WOHTMLParserHandler _handler) {
    this.handler = _handler;
  }

  /* top-level parsing */

  /**
   * Parses the given template characters into a list of top-level elements.
   *
   * Null data is rejected before the handler is consulted, so the handler
   * never receives a willParseHTMLData() call which is not followed by a
   * finished/failed notification because of missing input. This matches the
   * other overloads, which also return null for null input right away.
   *
   * @param _data the template source, may be null
   * @return the parsed top-level elements, or null if the data was null, the
   *         handler vetoed the parse, or an error was recorded while parsing
   */
  public List parseHTMLData(char[] _data) {
    if (_data == null)
      return null;
    if (!this.handler.willParseHTMLData(this, _data))
      return null;

    /* reset state */
    this.lastException = null;
    this.buffer = _data;
    this.idx    = 0;
    this.len    = this.buffer.length;

    /* start parsing */
    List topLevel = new ArrayList(16);
    while ((this.idx < this.len) && (this.lastException == null)) {
      int lastIdx = this.idx;

      WOElement element = this._parseElement();
      if (element == null) {
        /* no element and no progress: stop, otherwise we would loop forever */
        if (this.idx == lastIdx) {
          this.log.error("parseElement didn't parse anything at: " + this.idx);
          break;
        }
        continue;
      }
      topLevel.add(element);
    }

    /* notify handler of result */
    if (this.lastException != null)
      this.handler.failedParsingHTMLData(this, _data, this.lastException);
    else
      this.handler.finishedParsingHTMLData(this, _data, topLevel);

    /* reset temporary state */
    this.buffer = null;
    this.idx    = -1;
    this.len    = -1;

    return this.lastException != null ? null : topLevel;
  }

  /**
   * Parses a template given as a String.
   *
   * @return the parsed elements, or null for null input or on parse errors
   */
  public List parseHTMLData(String _data) {
    if (_data == null)
      return null;
    return this.parseHTMLData(_data.toCharArray());
  }

  /**
   * Parses a template given as bytes, which are decoded as UTF-8.
   *
   * @return the parsed elements, or null for null input, if the charset is
   *         not available, or on parse errors
   */
  public List parseHTMLData(byte[] _buf) {
    if (_buf == null)
      return null;

    // TODO: check prefix for encoding
    try {
      return this.parseHTMLData(new String(_buf, "utf8"));
    }
    catch (UnsupportedEncodingException e) {
      this.log.error("failed to transform byte array to UTF-8", e);
      return null;
    }
  }

  /**
   * Parses a template read from the given stream. The content is loaded using
   * NSPropertyListParser.loadContentFromStream(). The stream is not closed by
   * this method itself.
   *
   * @return the parsed elements, or null for null input or on parse errors
   */
  public List parseHTMLData(InputStream _in) {
    if (_in == null)
      return null;
    return this.parseHTMLData(NSPropertyListParser.loadContentFromStream(_in));
  }

  /**
   * Parses a template loaded from the given URL.
   *
   * The stream opened on the URL is always closed before the method returns.
   * Closing in 'finally' is safe even if the content loader already closed
   * the stream, because closing an already closed stream has no effect.
   *
   * @return the parsed elements, or null for null input, if the URL could not
   *         be opened, or on parse errors
   */
  public List parseHTMLData(URL _url) {
    if (_url == null)
      return null;

    InputStream in = null;
    try {
      in = _url.openStream();
      return this.parseHTMLData(in);
    }
    catch (IOException e) {
      this.log.error("could not read from URL: " + _url, e);
      return null;
    }
    finally {
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException e) {
          /* the parse result is not affected, so just report it */
          this.log.warn("could not close stream of URL: " + _url, e);
        }
      }
    }
  }

  /* error handling */

  /**
   * Returns the error recorded by the last parse, or null if there was none.
   */
  public Exception lastException() {
    return this.lastException;
  }

  /**
   * Forgets the recorded error.
   */
  public void resetLastException() {
    this.lastException = null;
  }

  /**
   * Records a parse error. The parse loops check lastException and stop as
   * soon as it is set.
   *
   * @param _error description of the problem
   */
  protected void addException(String _error) {
    // TODO: keep old exceptions?
// TODO: SOPE has some elaborate error tracking which we might want to add this.log.debug("exception: " + _error); this.lastException = new Exception(_error); } /* parsing */ protected WOElement _parseElement() { boolean isDebugOn = this.log.isDebugEnabled(); if (this.idx >= this.len) /* EOF */ return null; if (this._isHashTag()) { /* start parsing of dynamic content */ if (isDebugOn) this.log.debug("detected hash element ..."); return this._parseHashElement(); } if (this._isHashCloseTag()) { this.log.warn("unexpected hash close tag ()"); // TODO: in SOPE we raise an exception } if (this._isWOTag()) { /* start parsing of dynamic content */ if (isDebugOn) this.log.debug("detected WO element ..."); return this._parseWOElement(); } if (this._isWOCloseTag()) { this.log.warn("unexpected WEBOBJECT close tag ()"); // TODO: in SOPE we raise an exception } /* parse text/tag content */ int startPos = this.idx; while (this.idx < this.len) { /* scan until we find a tag marker '<' */ while ((this.idx < this.len) && (this.buffer[this.idx] != '<')) this.idx++; if (this.idx >= this.len) /* EOF was reached */ break; /* check whether its a tag which we parse */ if (_isHashTag()) /* found Hash */ break; if (_isHashCloseTag()) break; if (_isWOTag()) /* found Hash */ break; if (_isWOCloseTag()) break; if (this._isComment()) { if (isDebugOn) this.log.debug("detected comment ..."); this.idx += 3; // skip '<--' while (this.idx < this.len) { if (this.buffer[this.idx] == '-') { if (this.idx + 2 < this.len) { if ((this.buffer[this.idx + 1] == '-') && (this.buffer[this.idx + 2] == '>')) { // found '-->' this.idx += 3; // skip '-->' break; } } } this.idx++; } if (this.idx >= this.len) // EOF was reached break; } else { if (isDebugOn) this.log.debug("read regular tag ..."); // skip '<', read usual tag this.idx++; if (this.idx >= this.len) { // EOF was reached with opening '<' this.log.warn("reached EOF with '<' at end !"); break; } if (skipPlainTags) { /* skip until end of HTML tag (not #-tag) 
*/ do { this.idx++; } while ((this.buffer[this.idx] != '>') && (this.idx < this.len)); if (this.idx >= this.len) break; // EOF } this.idx++; } } if (this.idx - startPos <= 0) { if (isDebugOn) this.log.debug("static string element w/o content ..."); return null; } String s = new String(this.buffer, startPos, this.idx - startPos); if (isDebugOn) this.log.debug("static string, length: " + s.length()); return new WOStaticHTMLElement(s); } protected void _skipSpaces() { int pos = this.idx; if (pos >= this.len) return; /* EOF */ while ((pos < this.len) && _isHTMLSpace(this.buffer[pos])) pos++; this.idx = pos; } protected WOElement _parseHashElement() { boolean isDebugOn = this.log.isDebugEnabled(); if (this.idx >= this.len) return null; /* EOF */ if (!this._isHashTag()) return null; /* not a hash tag */ if (isDebugOn) this.log.debug("parse hash element ..."); this.idx += 2; /* skip '<#' */ boolean hadSlashAfterHash = this.buffer[this.idx] == '/'; if (hadSlashAfterHash) { /* a tag starting like this: "<#/", probably an typo */ this.log.error("typo in hash close tag ('<#/' => ' attrs = this._parseTagAttributes(); if (this.lastException != null) return null; // invalid tag attrs if (this.idx >= this.len) { this.addException("unexpected EOF: missing '>' in hash tag (EOF)."); return null; // unexpected EOF } /* parse tag end (> or /) */ if (this.buffer[this.idx] != '>' && this.buffer[this.idx] != '/') { this.addException("missing '>' in hash element tag."); return null; // unexpected EOF } boolean isAutoClose = false; boolean foundEndTag = false; List children = null; if (this.buffer[this.idx] == '>') { /* hashtag is closed */ /* has sub-elements (<#name>...) 
*/ this.idx += 1; // skip '>' if (isDebugOn) this.log.debug(" parsing hash children ..."); while ((this.idx < this.len) && (this.lastException == null)) { WOElement subElement = null; if (this._isHashCloseTag()) { foundEndTag = true; break; } subElement = this._parseElement(); if (subElement != null) { if (children == null) children = new ArrayList(16); children.add(subElement); } } } else { /* is an empty tag (<#name/>) */ /* has no sub-elements (<#name/>) */ if (isDebugOn) this.log.debug(" is autoclose hash-tag ..."); this.idx += 1; // skip '/' isAutoClose = true; if (this.buffer[this.idx] != '>') { this.addException("missing '>' in hash element tag."); return null; // unexpected EOF } this.idx += 1; // skip '>' } /* produce elements */ if (name.length() < 1) { this.addException("missing name in hash element tag."); return null; } Map nameDict = new HashMap(1); nameDict.put("NAME", name); if (attrs != null) attrs.putAll(nameDict); WOElement element = this.handler.dynamicElementWithName (name, (attrs != null ? 
attrs : nameDict), children); nameDict = null; if (isDebugOn) this.log.debug(" hash element: " + element); if (element == null) { // build error this.addException("could not build hash element: " + name); return null; } if (!foundEndTag && !isAutoClose) { this.addException("did not find hash end tag () .."); return null; } else if (!isAutoClose) { /* skip close tag ('') */ if (!this._isHashCloseTag()) this.log.error("invalid parser state .."); this.idx += 3; // skip '')) this.idx += 1; this.idx += 1; // skip '>' } return element; } protected WOElement _parseWOElement() { boolean isDebugOn = this.log.isDebugEnabled(); if (this.idx >= this.len) return null; /* EOF */ if (!this._isWOTag()) return null; /* not a WEBOBJECT tag */ if (isDebugOn) this.log.debug("parse WEBOBJECT element ..."); this.idx += 10; /* skip ' attrs = this._parseTagAttributes(); if (this.lastException != null || attrs == null) return null; // invalid tag attrs String name = null; if ((name = attrs.get("NAME")) == null) name = attrs.get("name"); if (name == null) { this.addException("missing name in WEBOBJECT element tag."); return null; } if (name.length() < 1) { this.addException("missing name in WEBOBJECT element tag."); return null; } if (this.idx >= this.len) { this.addException("unexpected EOF: missing '>' in WEBOBJECT tag (EOF)."); return null; // unexpected EOF } /* parse tag end '>' */ if (this.buffer[this.idx] != '>') { this.addException("missing '>' in WEBOBJECT element tag."); return null; // unexpected EOF } this.idx += 1; // skip '>' boolean foundEndTag = false; List children = null; if (isDebugOn) this.log.debug(" parsing WEBOBJECT children ..."); while ((this.idx < this.len) && (this.lastException == null)) { WOElement subElement = null; if (this._isWOCloseTag()) { foundEndTag = true; break; } subElement = this._parseElement(); if (subElement != null) { if (children == null) children = new ArrayList(16); children.add(subElement); } } /* produce elements */ WOElement element = 
this.handler.dynamicElementWithName(name, attrs, children); if (isDebugOn) this.log.debug(" WEBOBJECT element: " + element); if (element == null) { // build error this.addException("could not build WEBOBJECT element !."); return null; } if (!foundEndTag) { this.addException("did not find WEBOBJECT end tag () .."); return null; } /* skip close tag ('') */ if (!this._isWOCloseTag()) this.log.error("invalid parser state .."); this.idx += 11; // skip '' return element; } protected Map _parseTagAttributes() { this._skipSpaces(); if (this.idx >= this.len) return null; /* EOF */ Map attrs = null; do { this._skipSpaces(); if (this.idx >= this.len) break; /* EOF */ /* read key */ String key = this._parseStringValue(); if (key == null) /* ended */ break; /* The following parses: space* '=' space* */ this._skipSpaces(); if (this.idx >= this.len) { /* EOF */ this.addException("expected '=' after key in attributes .."); break; /* unexpected EOF */ } if (this.buffer[this.idx] != '=') { this.addException("expected '=' after key in attributes .."); break; } this.idx++; /* skip '=' */ this._skipSpaces(); if (this.idx >= this.len) { /* EOF */ this.addException("expected value after key in attributes .."); break; /* unexpected EOF */ } /* read value */ String value = this._parseStringValue(); if (value == null) { this.addException("expected value after key in attributes .."); break; /* unexpected EOF */ } /* add to Map */ if (attrs == null) attrs = new HashMap(2); attrs.put(key, value); } while (this.idx < this.len); return attrs; } protected String _parseStringValue() { this._skipSpaces(); int pos = this.idx; if (pos >= this.len) return null; /* EOF */ char c = this.buffer[pos]; if (c == '>' || c == '/' || c == '=') return null; if (this.buffer[pos] == '"') { /* quoted string */ pos++; /* skip starting quote ('"') */ int ilen = 0; int startPos = pos; /* loop until closing quote */ while ((pos < this.len) && (this.buffer[pos] != '"')) { pos++; ilen++; } if (pos == this.len) { /* 
syntax error, quote not closed */ this.idx = pos; this.addException("quoted string not closed (expected '\"')"); return null; } pos++; /* skip closing quote */ this.idx = pos; /* store pointer */ if (ilen == 0) /* empty string */ return ""; return new String(this.buffer, startPos, ilen); } /* string without quotes */ int startPos = pos; if (pos >= this.len) return null; /* EOF */ /* loop until '>' or '=' or '/' or space */ c = this.buffer[pos]; while ((c != '>' && c != '=' && c != '/') && !_isHTMLSpace(c)) { pos++; if (pos >= this.len) break; c = this.buffer[pos]; } this.idx = pos; if ((pos - startPos) == 0) /* wasn't a string .. */ return null; return new String(this.buffer, startPos, pos - startPos); } /* lookahead */ protected boolean _isComment() { /* checks whether a comment is upcoming (