Merge branch 'master' of git://gitorious.org/~reger/yacy/bbyacy-rc1

13 years ago · d88eb657fd
parent 19d1f474ce 722a447b0d
commit d88eb657fd
2 changed files with 67 additions and 100 deletions
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -79,7 +79,7 @@ public class Document {
    private       List<String> titles;          // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
    private final StringBuilder creator;        // author or copyright
    private final String publisher;             // publisher
-    private       List<String>  sections;       // if present: more titles/headlines appearing in the document
+    private final List<String>  sections;       // if present: more titles/headlines appearing in the document
    private final StringBuilder description;    // an abstract, if present: short content description
    private Object text;                        // the clear text, all that is visible
    private final Map<MultiProtocolURI, Properties> anchors; // all links embedded as clickable entities (anchor tags)
@ -116,9 +116,10 @@ public class Document {
        this.parserObject = parserObject;
        this.keywords = new LinkedList<String>();
        if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
-        this.titles = titles;
+        this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
        this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
-        this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
+        this.sections =  new LinkedList<String>() ;
+        if (sections != null) this.sections.addAll(Arrays.asList(sections));
        this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
        this.lon = lon;
        this.lat = lat;
@ -631,17 +632,7 @@ dc_rights

    public void addSubDocuments(final Document[] docs) throws IOException {
        for (final Document doc: docs) {
-            // check class as addAll method might not be available if initialized via Arrays.asList
-            if (this.sections.getClass() == java.util.LinkedList.class) {
-                this.sections.addAll(doc.sections);
-            } else {
-                /* sections might be initialized via Arrays.asList (a fixed-size list which does not support the addAll method)
-                   so a new list must be assigned */
-                LinkedList<String> tmplist = new LinkedList<String>();
-                tmplist.addAll(this.sections);
-                tmplist.addAll(doc.sections);
-                this.sections = tmplist;
-            }
+            this.sections.addAll(doc.sections);
            this.titles.addAll(doc.titles());
            this.keywords.addAll(doc.getKeywords());

--- a/source/net/yacy/document/parser/augment/AugmentParser.java
+++ b/source/net/yacy/document/parser/augment/AugmentParser.java
@ -20,94 +20,70 @@ public class AugmentParser extends AbstractParser implements Parser {

    RDFaParser rdfaParser;

-	public AugmentParser() {
-		super("AugmentParser");
-		this.rdfaParser = new RDFaParser();
-
-		Log.logInfo("AugmentedParser", "augmented parser was initialized");
-
-		this.SUPPORTED_EXTENSIONS.add("html");
-		this.SUPPORTED_EXTENSIONS.add("php");
-		this.SUPPORTED_MIME_TYPES.add("text/html");
-		this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
-		this.SUPPORTED_EXTENSIONS.add("html");
-		this.SUPPORTED_EXTENSIONS.add("htm");
-	}
-
-	@Override
-	public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
-
-            Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);
-            try {
-                source.reset();
-            } catch (IOException e) {
-                Log.logException(e);
+    public AugmentParser() {
+        super("AugmentParser");
+        this.rdfaParser = new RDFaParser();
+
+        Log.logInfo("AugmentedParser", "augmented parser was initialized");
+
+        this.SUPPORTED_EXTENSIONS.add("html");
+        this.SUPPORTED_EXTENSIONS.add("php");
+        this.SUPPORTED_MIME_TYPES.add("text/html");
+        this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
+        this.SUPPORTED_EXTENSIONS.add("html");
+        this.SUPPORTED_EXTENSIONS.add("htm");
+    }
+
+    @Override
+    public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException {
+
+        Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);
+        try {
+            source.reset();
+        } catch (IOException e) {
+            Log.logException(e);
+        }
+
+        for (final Document doc : htmlDocs) {
+            /* analyze(doc, url, mimeType, charset);  // enrich document text */
+            parseAndAugment(doc, url, mimeType, charset); // enrich document with additional tags
+        }
+        return htmlDocs;
+    }
+
+/*  TODO: not implemented yet
+ *
+    private void analyze(Document origDoc, DigestURI url,
+            String mimeType, String charset) {
+        // if the magic word appears in the document, perform extra actions.
+        if (origDoc.getKeywords().contains("magicword")) {
+            String all = "";
+            all = "yacylatest";
+            // TODO: append content of string all to origDoc.text, maybe use Document.mergeDocuments() to do so
+        }
+    }
+*/
+    private void parseAndAugment(Document origDoc, DigestURI url, String mimeType, String charset) {
+
+        Iterator<net.yacy.kelondro.blob.Tables.Row> it;
+        try {
+            it = Switchboard.getSwitchboard().tables.iterator("aggregatedtags");
+            it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator();
+            while (it.hasNext()) {
+                net.yacy.kelondro.blob.Tables.Row r = it.next();
+                if (r.get("url", "").equals(url.toNormalform(false))) {
+                    Set<String> tags = new HashSet<String>();
+                    for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) {
+                        tags.add(s);
+                    }
+                    origDoc.addTags(tags);
+                }
            }

-            Document alreadyParsedDocument = htmlDocs[0];
-            Document superDoc = analyze(alreadyParsedDocument, url, mimeType, charset);
-            Document augmentDoc = parseAndAugment(url, mimeType, charset);
-            Document[] retDocs = new Document[htmlDocs.length + 1];
-            for (int i = 1; i < htmlDocs.length; i++) {
-                retDocs[i - 1] = htmlDocs[i];
-            }
-
-            retDocs[retDocs.length - 1] = augmentDoc;
-            retDocs[retDocs.length - 2] = superDoc;
-            try { // merge additional result docs into the parse main document
-                alreadyParsedDocument.addSubDocuments(retDocs);
-            } catch (IOException ex) {
-                Log.logException(ex);
-            }
-            Document[] finalretDocs = new Document[1]; // return the merged document
-            finalretDocs[0] = alreadyParsedDocument;
-            return finalretDocs;
-	}
-
-	private static Document analyze (Document alreadyParsedDocument, DigestURI url,
-			String mimeType, String charset) {
-
-		Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
-				"", null, "", 0, 0, null, null, null, null, false);
-
-		// if the magic word appears in the document, perform extra actions.
-		if (alreadyParsedDocument.getKeywords().contains("magicword")) {
-			String all = "";
-			all = "yacylatest";
-			newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
-					"", null, "", 0, 0, all, null, null, null, false);
-		}
-
-		return newDoc;
-	}
-
-	private Document parseAndAugment(DigestURI url, String mimeType, String charset) {
-
-		String all = "";
-		Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
-				"", null, "", 0, 0, all, null, null, null, false);
-
-		Iterator<net.yacy.kelondro.blob.Tables.Row> it;
-		try {
-			it = Switchboard.getSwitchboard().tables.iterator("aggregatedtags");
-			it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator();
-			while (it.hasNext()) {
-				net.yacy.kelondro.blob.Tables.Row r = it.next();
-				if (r.get("url", "").equals (url.toNormalform(false))) {
-					Set<String> tags = new HashSet<String>();
-					for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) {
-						tags.add(s);
-					}
-					newDoc.addTags(tags);
-				}
-			}
-
-		} catch (IOException e) {
-			Log.logException(e);
-		}
-
-		return newDoc;
-	}
+        } catch (IOException e) {
+            Log.logException(e);
+        }
+    }


 }