diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index b63bb3b1f..773d2ba3c 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -79,7 +79,7 @@ public class Document { private List titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result private final StringBuilder creator; // author or copyright private final String publisher; // publisher - private List sections; // if present: more titles/headlines appearing in the document + private final List sections; // if present: more titles/headlines appearing in the document private final StringBuilder description; // an abstract, if present: short content description private Object text; // the clear text, all that is visible private final Map anchors; // all links embedded as clickeable entities (anchor tags) @@ -116,9 +116,10 @@ public class Document { this.parserObject = parserObject; this.keywords = new LinkedList(); if (keywords != null) this.keywords.addAll(Arrays.asList(keywords)); - this.titles = titles; + this.titles = (titles == null) ? new ArrayList(1) : titles; this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); - this.sections = (sections == null) ? new LinkedList() : Arrays.asList(sections); + this.sections = new LinkedList() ; + if (sections != null) this.sections.addAll(Arrays.asList(sections)); this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct); this.lon = lon; this.lat = lat; @@ -631,17 +632,7 @@ dc_rights public void addSubDocuments(final Document[] docs) throws IOException { for (final Document doc: docs) { - // check class as addAll method might not be available if initialized via Arrays.toList - if (this.sections.getClass() == java.util.LinkedList.class) { - this.sections.addAll(doc.sections); - } else { - /* sections might be initialized via Arrays.toList (which does not implement the addAll method) - so new list must be assigned */ - LinkedList tmplist = new LinkedList(); - tmplist.addAll(this.sections); - tmplist.addAll(doc.sections); - this.sections = tmplist; - } + this.sections.addAll(doc.sections); this.titles.addAll(doc.titles()); this.keywords.addAll(doc.getKeywords()); diff --git a/source/net/yacy/document/parser/augment/AugmentParser.java b/source/net/yacy/document/parser/augment/AugmentParser.java index 1b32ae3f9..872dfce73 100644 --- a/source/net/yacy/document/parser/augment/AugmentParser.java +++ b/source/net/yacy/document/parser/augment/AugmentParser.java @@ -20,94 +20,70 @@ public class AugmentParser extends AbstractParser implements Parser { RDFaParser rdfaParser; - public AugmentParser() { - super("AugmentParser"); - this.rdfaParser = new RDFaParser(); - - Log.logInfo("AugmentedParser", "augmented parser was initialized"); - - this.SUPPORTED_EXTENSIONS.add("html"); - this.SUPPORTED_EXTENSIONS.add("php"); - this.SUPPORTED_MIME_TYPES.add("text/html"); - this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml"); - this.SUPPORTED_EXTENSIONS.add("html"); - this.SUPPORTED_EXTENSIONS.add("htm"); - } - - @Override - public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { - - Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source); - try { - source.reset(); - } catch (IOException e) { - Log.logException(e); + public AugmentParser() { + super("AugmentParser"); + this.rdfaParser = new RDFaParser(); + + Log.logInfo("AugmentedParser", "augmented parser was initialized"); + + this.SUPPORTED_EXTENSIONS.add("html"); + this.SUPPORTED_EXTENSIONS.add("php"); + this.SUPPORTED_MIME_TYPES.add("text/html"); + this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml"); + this.SUPPORTED_EXTENSIONS.add("html"); + this.SUPPORTED_EXTENSIONS.add("htm"); + } + + @Override + public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { + + Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source); + try { + source.reset(); + } catch (IOException e) { + Log.logException(e); + } + + for (final Document doc : htmlDocs) { + /* analyze(doc, url, mimeType, charset); // enrich document text */ + parseAndAugment(doc, url, mimeType, charset); // enrich document with additional tags + } + return htmlDocs; + } + +/* TODO: not implemented yet + * + private void analyze(Document origDoc, DigestURI url, + String mimeType, String charset) { + // if the magic word appears in the document, perform extra actions. + if (origDoc.getKeywords().contains("magicword")) { + String all = ""; + all = "yacylatest"; + // TODO: append content of string all to origDoc.text, maybe use Document.mergeDocuments() to do so + } + } +*/ + private void parseAndAugment(Document origDoc, DigestURI url, String mimeType, String charset) { + + Iterator it; + try { + it = Switchboard.getSwitchboard().tables.iterator("aggregatedtags"); + it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator(); + while (it.hasNext()) { + net.yacy.kelondro.blob.Tables.Row r = it.next(); + if (r.get("url", "").equals(url.toNormalform(false))) { + Set tags = new HashSet(); + for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) { + tags.add(s); + } + origDoc.addTags(tags); + } } - Document alreadyParsedDocument = htmlDocs[0]; - Document superDoc = analyze(alreadyParsedDocument, url, mimeType, charset); - Document augmentDoc = parseAndAugment(url, mimeType, charset); - Document[] retDocs = new Document[htmlDocs.length + 1]; - for (int i = 1; i < htmlDocs.length; i++) { - retDocs[i - 1] = htmlDocs[i]; - } - - retDocs[retDocs.length - 1] = augmentDoc; - retDocs[retDocs.length - 2] = superDoc; - try { // merge additional result docs into the parse main document - alreadyParsedDocument.addSubDocuments(retDocs); - } catch (IOException ex) { - Log.logException(ex); - } - Document[] finalretDocs = new Document[1]; // return the merged document - finalretDocs[0] = alreadyParsedDocument; - return finalretDocs; - } - - private static Document analyze (Document alreadyParsedDocument, DigestURI url, - String mimeType, String charset) { - - Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "", - "", null, "", 0, 0, null, null, null, null, false); - - // if the magic word appears in the document, perform extra actions. - if (alreadyParsedDocument.getKeywords().contains("magicword")) { - String all = ""; - all = "yacylatest"; - newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "", - "", null, "", 0, 0, all, null, null, null, false); - } - - return newDoc; - } - - private Document parseAndAugment(DigestURI url, String mimeType, String charset) { - - String all = ""; - Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "", - "", null, "", 0, 0, all, null, null, null, false); - - Iterator it; - try { - it = Switchboard.getSwitchboard().tables.iterator("aggregatedtags"); - it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator(); - while (it.hasNext()) { - net.yacy.kelondro.blob.Tables.Row r = it.next(); - if (r.get("url", "").equals (url.toNormalform(false))) { - Set tags = new HashSet(); - for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) { - tags.add(s); - } - newDoc.addTags(tags); - } - } - - } catch (IOException e) { - Log.logException(e); - } - - return newDoc; - } + } catch (IOException e) { + Log.logException(e); + } + } }