small changes

Conflicts: source/net/yacy/document/parser/augment/AugmentParser.java source/net/yacy/interaction/Interaction.java
13 years ago · 5f8ba7f4f2
parent 09a34cfe1b
commit 5f8ba7f4f2
2 changed files with 826 additions and 791 deletions
--- a/source/net/yacy/document/parser/augment/AugmentParser.java
+++ b/source/net/yacy/document/parser/augment/AugmentParser.java
@ -1,70 +1,67 @@
 package net.yacy.document.parser.augment;

-import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.net.MalformedURLException;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;

-import net.yacy.yacy;
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
-import net.yacy.document.parser.htmlParser;
-import net.yacy.document.parser.rdfa.IRDFaTriple;
 import net.yacy.document.parser.rdfa.impl.RDFaParser;
-import net.yacy.document.parser.rdfa.impl.RDFaTripleImpl;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.logging.Log;
+import net.yacy.search.Switchboard;
+import de.anomic.data.ymark.YMarkUtil;


 public class AugmentParser extends RDFaParser {

 	public AugmentParser(String name) {
 		super(name);
-		
+		// Removes the extensions and MIME types listed below from SUPPORTED_EXTENSIONS / SUPPORTED_MIME_TYPES (not declared in the visible code; presumably inherited), then re-adds a smaller subset.
 		System.out.println("augmented parser was initialized");

-		SUPPORTED_EXTENSIONS.remove("htm");
-		SUPPORTED_EXTENSIONS.remove("html");
-		SUPPORTED_EXTENSIONS.remove("shtml");
-		SUPPORTED_EXTENSIONS.remove("xhtml");
-		SUPPORTED_EXTENSIONS.remove("php");
-		SUPPORTED_EXTENSIONS.remove("php3");
-		SUPPORTED_EXTENSIONS.remove("php4");
-		SUPPORTED_EXTENSIONS.remove("php5");
-		SUPPORTED_EXTENSIONS.remove("cfm");
-		SUPPORTED_EXTENSIONS.remove("asp");
-		SUPPORTED_EXTENSIONS.remove("aspx");
-		SUPPORTED_EXTENSIONS.remove("tex");
-		SUPPORTED_EXTENSIONS.remove("txt");
-		SUPPORTED_EXTENSIONS.remove("jsp");
-		SUPPORTED_EXTENSIONS.remove("mf");
-		SUPPORTED_EXTENSIONS.remove("pl");
-		SUPPORTED_EXTENSIONS.remove("py");
-		SUPPORTED_MIME_TYPES.remove("text/html");
-		SUPPORTED_MIME_TYPES.remove("text/xhtml+xml");
-		SUPPORTED_MIME_TYPES.remove("application/xhtml+xml");
-		SUPPORTED_MIME_TYPES.remove("application/x-httpd-php");
-		SUPPORTED_MIME_TYPES.remove("application/x-tex");
-		SUPPORTED_MIME_TYPES.remove("text/plain");
-		SUPPORTED_MIME_TYPES.remove("text/sgml");
-		SUPPORTED_MIME_TYPES.remove("text/csv");
-		
-		SUPPORTED_EXTENSIONS.add("html");
-		SUPPORTED_EXTENSIONS.add("php");
-		SUPPORTED_MIME_TYPES.add("text/html");
-		SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
-		SUPPORTED_EXTENSIONS.add("html");
-		SUPPORTED_EXTENSIONS.add("htm");
+		this.SUPPORTED_EXTENSIONS.remove("htm");
+		this.SUPPORTED_EXTENSIONS.remove("html");
+		this.SUPPORTED_EXTENSIONS.remove("shtml");
+		this.SUPPORTED_EXTENSIONS.remove("xhtml");
+		this.SUPPORTED_EXTENSIONS.remove("php");
+		this.SUPPORTED_EXTENSIONS.remove("php3");
+		this.SUPPORTED_EXTENSIONS.remove("php4");
+		this.SUPPORTED_EXTENSIONS.remove("php5");
+		this.SUPPORTED_EXTENSIONS.remove("cfm");
+		this.SUPPORTED_EXTENSIONS.remove("asp");
+		this.SUPPORTED_EXTENSIONS.remove("aspx");
+		this.SUPPORTED_EXTENSIONS.remove("tex");
+		this.SUPPORTED_EXTENSIONS.remove("txt");
+		this.SUPPORTED_EXTENSIONS.remove("jsp");
+		this.SUPPORTED_EXTENSIONS.remove("mf");
+		this.SUPPORTED_EXTENSIONS.remove("pl");
+		this.SUPPORTED_EXTENSIONS.remove("py");
+		this.SUPPORTED_MIME_TYPES.remove("text/html");
+		this.SUPPORTED_MIME_TYPES.remove("text/xhtml+xml");
+		this.SUPPORTED_MIME_TYPES.remove("application/xhtml+xml");
+		this.SUPPORTED_MIME_TYPES.remove("application/x-httpd-php");
+		this.SUPPORTED_MIME_TYPES.remove("application/x-tex");
+		this.SUPPORTED_MIME_TYPES.remove("text/plain");
+		this.SUPPORTED_MIME_TYPES.remove("text/sgml");
+		this.SUPPORTED_MIME_TYPES.remove("text/csv");
+		// Re-added subset: extensions "html", "php", "htm" and MIME types "text/html", "text/xhtml+xml".
+		this.SUPPORTED_EXTENSIONS.add("html");
+		this.SUPPORTED_EXTENSIONS.add("php");
+		this.SUPPORTED_MIME_TYPES.add("text/html");
+		this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
+		this.SUPPORTED_EXTENSIONS.add("html"); // NOTE(review): "html" was already added above; redundant if the collection is a Set (TODO confirm)
+		this.SUPPORTED_EXTENSIONS.add("htm");
 	}

 	@Override
 	public Document[] parse(MultiProtocolURI url, String mimeType,
 			String charset, InputStream source) throws Failure,
 			InterruptedException {
-		
+
 		Document[] htmlDocs = super.parse(url, mimeType, charset, source);
 		try {
 			source.reset();
@ -72,9 +69,9 @@ public class AugmentParser extends RDFaParser {
 			// TODO Auto-generated catch block
 			e.printStackTrace();
 		}
-		
+
 		String urlHash = String.valueOf(url.hashCode());
-		
+
 		DigestURI durl;
 		try {
 			durl = new DigestURI(MultiProtocolURI.unescape(url.toString()));
@ -83,53 +80,92 @@ public class AugmentParser extends RDFaParser {
 			// TODO Auto-generated catch block
 			e1.printStackTrace();
 		}
-        
-		Document theDoc = htmlDocs[0];
-		
-		
-		Document superDoc = new Document(url, mimeType, charset, null, null, null, "", "",
-				"", null, "", 0, 0, null, null, null, null, false);
-		
-		// if the magic word appears in the document, perform extra actions.
-		
-		
-//		if (htmlDocs[0].getKeywords().contains("magicword")) {		
-//			String all = "";
-//			
-//			all = "yacylatest";
-//			superDoc = new Document(url, mimeType, charset, null, null, null, "", "",
-//					"", null, "", 0, 0, all.getBytes(), null, null, null, false);
-//		}
-			
+
+		Document alreadyParsedDocument = htmlDocs[0];
+
+		Document superDoc = analyze(alreadyParsedDocument, url, mimeType, charset, source);
+
+
+
 		Document augmentDoc = parseAndAugment(url, mimeType, charset, source);
-		
-		
+
+
 		Document[] retDocs = new Document[htmlDocs.length + 2];
 		for (int i = 0; i < htmlDocs.length; i++) {
 			retDocs[i] = htmlDocs[i];
 		}
-		
+
 		retDocs[retDocs.length - 1] = augmentDoc;
 		retDocs[retDocs.length - 2] = superDoc;
-	 
+
 		return retDocs;
-	
-	}	
-	
+
+	}
+
+	private Document analyze (Document alreadyParsedDocument, MultiProtocolURI url,
+			String mimeType, String charset, InputStream source) {
+		// Builds the extra document that parse() stores as "superDoc"; the 'source' parameter is never read here.
+		Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
+				"", null, "", 0, 0, null, null, null, null, false);
+		// Default result: carries only url, mimeType and charset; every other argument is null, "", 0 or false.
+		// If the keywords of the already-parsed document contain "magicword", the default result is
+		// replaced by a document built from the bytes of "yacylatest" (presumably its text content;
+		// TODO confirm against the Document constructor).
+		if (alreadyParsedDocument.getKeywords().contains("magicword")) {
+			String all = ""; // initial value is overwritten by the next assignment
+			// NOTE(review): all.getBytes() below encodes with the platform default charset.
+			all = "yacylatest";
+			newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
+					"", null, "", 0, 0, all.getBytes(), null, null, null, false);
+		}
+
+		return newDoc;
+	}
+

 	private Document parseAndAugment(MultiProtocolURI url,
 			String mimeType, String charset, InputStream source) {

 		String all = "";
-		
-		// add even more information to the document in external routines. 
-		
-//		all = "augmented";
-	
-		Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
-				"", null, "", 0, 0, all.getBytes(), null, null, null, false);
-		return doc;
+		// Document built from the bytes of the empty string 'all'; tags found below are added to it. 'source' is never read here.
+		Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
+				"", null, "", 0, 0, all.getBytes(), null, null, null, false);
+		// Walk the "aggregatedtags" table, re-ordered by "timestamp_creation" through tables.orderBy(it, -1, ...)
+		// (what the -1 argument selects is not visible here; TODO confirm), looking for rows about this URL.
+		Iterator<net.yacy.kelondro.blob.Tables.Row> it;
+		try {
+			it = Switchboard.getSwitchboard().tables.iterator("aggregatedtags");
+
+			it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator();
+
+			while (it.hasNext()) {
+				net.yacy.kelondro.blob.Tables.Row r = it.next();
+				// A row matches when its "url" column equals url.toNormalform(false, false).
+				if (r.get("url", "").equals (url.toNormalform(false, false))) {
+					// Copy the entries YMarkUtil.keysStringToSet yields for the row's "scitag" column into a fresh set.
+					Set<String> tags = new HashSet<String>();
+
+					for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) {
+
+						tags.add(s);
+
+					}
+
+					// Every matching row adds its tags to the same document.
+					newDoc.addTags(tags);
+
+				}
+			}
+
+
+		} catch (IOException e) {
+			// TODO handle properly: the trace is only printed, and newDoc is still returned with any tags added so far.
+			e.printStackTrace();
+		}
+
+
+		return newDoc;
 	}
-	
+

 }
--- a/source/net/yacy/interaction/Interaction.java
+++ b/source/net/yacy/interaction/Interaction.java