// yacy_search_server/source/net/yacy/document/parser/augment/AugmentParser.java

package net.yacy.document.parser.augment;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import net.yacy.data.ymark.YMarkUtil;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.rdfa.impl.RDFaParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;


/**
 * Parser that delegates the actual parsing to an {@link RDFaParser} and then merges
 * additional "augmentation" documents into the first document that parser returned.
 *
 * Two extra documents are created for every parsed resource:
 * one by {@link #analyze(Document, DigestURI, String, String)} (keyword-triggered content) and
 * one by {@link #parseAndAugment(DigestURI, String, String)} (tags read from the
 * "aggregatedtags" table). Both are added as sub documents of the main document, so the
 * caller always receives a single merged document.
 */
public class AugmentParser extends AbstractParser implements Parser {

    /** Parser that produces the base documents which are augmented afterwards. */
    RDFaParser rdfaParser;

    /**
     * Creates the parser and registers the file extensions and mime types it handles.
     */
    public AugmentParser() {
        super("AugmentParser");
        this.rdfaParser = new RDFaParser();

        Log.logInfo("AugmentedParser", "augmented parser was initialized");

        // each extension is registered exactly once
        this.SUPPORTED_EXTENSIONS.add("html");
        this.SUPPORTED_EXTENSIONS.add("php");
        this.SUPPORTED_MIME_TYPES.add("text/html");
        this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
        this.SUPPORTED_EXTENSIONS.add("htm");
    }

    /**
     * Parses the source with the wrapped {@link RDFaParser} and merges the augmentation
     * documents and all further parser results into the first parsed document.
     *
     * @param url      location of the parsed resource
     * @param mimeType mime type handed to the wrapped parser and to the created documents
     * @param charset  charset handed to the wrapped parser and to the created documents
     * @param source   stream the wrapped parser reads from
     * @return an array holding only the merged main document; if the wrapped parser
     *         returned {@code null} or no documents, that result is returned unchanged
     * @throws Failure              declared by the interface; propagated from the wrapped parser
     * @throws InterruptedException declared by the interface; propagated from the wrapped parser
     */
    @Override
    public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {

        Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);
        try {
            // NOTE(review): the stream is not read again inside this method; the reset is
            // kept as-is, presumably for the benefit of the caller - confirm.
            source.reset();
        } catch (IOException e) {
            Log.logException(e);
        }

        // nothing to augment: avoid indexing into a missing or empty result
        if (htmlDocs == null || htmlDocs.length == 0) {
            return htmlDocs;
        }

        Document alreadyParsedDocument = htmlDocs[0];
        Document superDoc = analyze(alreadyParsedDocument, url, mimeType, charset);
        Document augmentDoc = parseAndAugment(url, mimeType, charset);

        // layout of retDocs: htmlDocs[1..n-1], then superDoc, then augmentDoc
        Document[] retDocs = new Document[htmlDocs.length + 1];
        System.arraycopy(htmlDocs, 1, retDocs, 0, htmlDocs.length - 1);
        retDocs[retDocs.length - 2] = superDoc;
        retDocs[retDocs.length - 1] = augmentDoc;

        try { // merge additional result docs into the parse main document
            alreadyParsedDocument.addSubDocuments(retDocs);
        } catch (IOException ex) {
            Log.logException(ex);
        }

        // return the merged document
        return new Document[] { alreadyParsedDocument };
    }

    /**
     * Creates an additional document for the given url. Its content is "yacylatest"
     * if the keywords of the already parsed document contain "magicword", otherwise
     * the content is {@code null}.
     *
     * @param alreadyParsedDocument document whose keywords are inspected
     * @param url                   location used for the new document
     * @param mimeType              mime type used for the new document
     * @param charset               charset used for the new document
     * @return the newly created document
     */
    private static Document analyze (Document alreadyParsedDocument, DigestURI url,
            String mimeType, String charset) {

        // if the magic word appears in the document, perform extra actions.
        String all = null;
        if (alreadyParsedDocument.getKeywords().contains("magicword")) {
            all = "yacylatest";
        }

        return new Document(url, mimeType, charset, null, null, null, singleList(""), "",
                "", null, "", 0, 0, all, null, null, null, false);
    }

    /**
     * Creates an additional document for the given url and adds the "scitag" values of
     * every row of the "aggregatedtags" table whose "url" column equals the normal form
     * of the url. An {@link IOException} while reading the table is logged; the document
     * built so far is returned in that case.
     *
     * @param url      location used for the new document and for the table lookup
     * @param mimeType mime type used for the new document
     * @param charset  charset used for the new document
     * @return the newly created document, carrying all tags that were found
     */
    private Document parseAndAugment(DigestURI url, String mimeType, String charset) {

        String all = "";
        Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
                "", null, "", 0, 0, all, null, null, null, false);

        // the normal form does not change while iterating, so compute it only once
        final String normalizedUrl = url.toNormalform(false);

        Iterator<net.yacy.kelondro.blob.Tables.Row> it;
        try {
            it = Switchboard.getSwitchboard().tables.iterator("aggregatedtags");
            it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator();
            while (it.hasNext()) {
                net.yacy.kelondro.blob.Tables.Row r = it.next();
                if (r.get("url", "").equals(normalizedUrl)) {
                    Set<String> tags = new HashSet<String>();
                    for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) {
                        tags.add(s);
                    }
                    newDoc.addTags(tags);
                }
            }

        } catch (IOException e) {
            Log.logException(e);
        }

        return newDoc;
    }


}
- fix: with augmented parsing = on, metadata (like the title) was missing in the index because it was overwritten when multiple result docs with the same url were added by the augment parser - fix Document.addSubDocuments: sections might be initialized via Arrays.asList, which does not support the .addAll method used there; see e.g. http://kamleshkr.wordpress.com/2010/02/17/inside-java-arrays-aslistt-a/ 12 years ago			`package net.yacy.document.parser.augment;`

			`import java.io.IOException;`
			`import java.io.InputStream;`
			`import java.util.HashSet;`
			`import java.util.Iterator;`
			`import java.util.Set;`

			`import net.yacy.data.ymark.YMarkUtil;`
			`import net.yacy.document.AbstractParser;`
			`import net.yacy.document.Document;`
			`import net.yacy.document.Parser;`
			`import net.yacy.document.parser.rdfa.impl.RDFaParser;`
			`import net.yacy.kelondro.data.meta.DigestURI;`
			`import net.yacy.kelondro.logging.Log;`
			`import net.yacy.search.Switchboard;`


			`public class AugmentParser extends AbstractParser implements Parser {`

			`RDFaParser rdfaParser;`

			`public AugmentParser() {`
			`super("AugmentParser");`
			`this.rdfaParser = new RDFaParser();`

			`Log.logInfo("AugmentedParser", "augmented parser was initialized");`

			`this.SUPPORTED_EXTENSIONS.add("html");`
			`this.SUPPORTED_EXTENSIONS.add("php");`
			`this.SUPPORTED_MIME_TYPES.add("text/html");`
			`this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");`
			`this.SUPPORTED_EXTENSIONS.add("html");`
			`this.SUPPORTED_EXTENSIONS.add("htm");`
			`}`

			`@Override`
			`public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {`

			`Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);`
			`try {`
			`source.reset();`
			`} catch (IOException e) {`
			`Log.logException(e);`
			`}`

			`Document alreadyParsedDocument = htmlDocs[0];`
			`Document superDoc = analyze(alreadyParsedDocument, url, mimeType, charset);`
			`Document augmentDoc = parseAndAugment(url, mimeType, charset);`
			`Document[] retDocs = new Document[htmlDocs.length + 1];`
			`for (int i = 1; i < htmlDocs.length; i++) {`
			`retDocs[i - 1] = htmlDocs[i];`
			`}`

			`retDocs[retDocs.length - 1] = augmentDoc;`
			`retDocs[retDocs.length - 2] = superDoc;`
			`try { // merge additional result docs into the parse main document`
			`alreadyParsedDocument.addSubDocuments(retDocs);`
			`} catch (IOException ex) {`
			`Log.logException(ex);`
			`}`
			`Document[] finalretDocs = new Document[1]; // return the merged document`
			`finalretDocs[0] = alreadyParsedDocument;`
			`return finalretDocs;`
			`}`

			`private static Document analyze (Document alreadyParsedDocument, DigestURI url,`
			`String mimeType, String charset) {`

			`Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",`
			`"", null, "", 0, 0, null, null, null, null, false);`

			`// if the magic word appears in the document, perform extra actions.`
			`if (alreadyParsedDocument.getKeywords().contains("magicword")) {`
			`String all = "";`
			`all = "yacylatest";`
			`newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",`
			`"", null, "", 0, 0, all, null, null, null, false);`
			`}`

			`return newDoc;`
			`}`

			`private Document parseAndAugment(DigestURI url, String mimeType, String charset) {`

			`String all = "";`
			`Document newDoc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",`
refactoring and new usage of SentenceReader: this class appeared as one of the major CPU users during snippet verification. The class was not efficient for two reasons: - it used a too complex input stream, generated from sources and UTF8 byte conversions; the BufferedReader added a strong overhead. - to feed data into the SentenceReader, multiple toString/getBytes conversions were applied until a buffered Reader from an input stream was possible. These superfluous conversions were removed. As a consequence, since the best source for the SentenceReader is a String, the production of Strings is now forced inside the Document class. 13 years ago			`"", null, "", 0, 0, all, null, null, null, false);`
- fix: with augmented parsing = on, metadata (like the title) was missing in the index because it was overwritten when multiple result docs with the same url were added by the augment parser - fix Document.addSubDocuments: sections might be initialized via Arrays.asList, which does not support the .addAll method used there; see e.g. http://kamleshkr.wordpress.com/2010/02/17/inside-java-arrays-aslistt-a/ 12 years ago
			`Iterator<net.yacy.kelondro.blob.Tables.Row> it;`
			`try {`
			`it = Switchboard.getSwitchboard().tables.iterator("aggregatedtags");`
			`it = Switchboard.getSwitchboard().tables.orderBy(it, -1, "timestamp_creation").iterator();`
			`while (it.hasNext()) {`
			`net.yacy.kelondro.blob.Tables.Row r = it.next();`
			`if (r.get("url", "").equals (url.toNormalform(false))) {`
			`Set<String> tags = new HashSet<String>();`
			`for (String s : YMarkUtil.keysStringToSet(r.get("scitag", ""))) {`
			`tags.add(s);`
			`}`
			`newDoc.addTags(tags);`
			`}`
			`}`

			`} catch (IOException e) {`
			`Log.logException(e);`
			`}`

small changes Conflicts: source/net/yacy/document/parser/augment/AugmentParser.java source/net/yacy/interaction/Interaction.java 13 years ago			`return newDoc;`
- fix: with augmented parsing = on, metadata (like the title) was missing in the index because it was overwritten when multiple result docs with the same url were added by the augment parser - fix Document.addSubDocuments: sections might be initialized via Arrays.asList, which does not support the .addAll method used there; see e.g. http://kamleshkr.wordpress.com/2010/02/17/inside-java-arrays-aslistt-a/ 12 years ago			`}`


			`}`