- Redesigned crawler and parser to accept embedded links from the NOLOAD queue rather than from virtual documents generated by the parser.
- The parser now generates descriptive texts for NOLOAD entries, which makes it possible to find media content through the search index instead of through the costly media prefetch algorithm that previously ran during search.
- Removed the media-search prefetch process from image search.
13 years ago · 659178942f
parent 3bea25c513
commit 659178942f
16 changed files with 100 additions and 110 deletions
--- a/source/de/anomic/crawler/Balancer.java
+++ b/source/de/anomic/crawler/Balancer.java
@ -56,7 +56,7 @@ import de.anomic.crawler.retrieval.Request;
 public class Balancer {
-    private static final String indexSuffix           = "9.db";
+    private static final String indexSuffix           = "A.db";
    private static final int    EcoFSBufferSize       = 1000;
    private static final int    objectIndexBufferSize = 1000;
    private static final String localhost             = "localhost";
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@ -60,8 +60,8 @@ import de.anomic.crawler.retrieval.Response;
 public class CrawlQueues {
-    private static final String ERROR_DB_FILENAME = "urlError3.db";
+    private static final String ERROR_DB_FILENAME = "urlError4.db";
-    private static final String DELEGATED_DB_FILENAME = "urlDelegated3.db";
+    private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db";
    private static final Segments.Process PROCESS = Segments.Process.LOCALCRAWLING;
    protected Switchboard sb;
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@ -159,7 +159,7 @@ public final class HTTPLoader {
                    // check if the url was already indexed
                    final String dbname = this.sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
-                    if (dbname != null) {
+                    if (dbname != null) { //OTTO
                        this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code);
                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
                    }
--- a/source/de/anomic/crawler/retrieval/Request.java
+++ b/source/de/anomic/crawler/retrieval/Request.java
@ -53,8 +53,8 @@ public class Request extends WorkflowJob
        + Word.commonHashLength
        + ", "
        + // the url's referrer hash
-        "String urlname-80, "
+        "String urlname-256, "
-        + // the name of the url, from anchor tag <a>name</a>
+        + // the name of the url, from anchor tag <a>name</a> (must be big to transport NOLOAD entries)
        "Cardinal appdate-8 {b256}, "
        + // the date of the resource; either file date or first appearance
        "String profile-"
@ -78,6 +78,8 @@ public class Request extends WorkflowJob
        "Cardinal size-8 {b256}", // size of resource in bytes (if known) or 0 if not known
        Base64Order.enhancedCoder);
    public final static int descrLength = rowdef.column(4).cellwidth;
    private byte[] initiator; // the initiator hash, is NULL or "" if it is the own proxy;
                              // if this is generated by a crawl, the own peer hash in entered
    private byte[] refhash; // the url's referrer hash
--- a/source/de/anomic/crawler/retrieval/Response.java
+++ b/source/de/anomic/crawler/retrieval/Response.java
@ -162,16 +162,23 @@ public class Response {
        this.content = content;
    }
    /**
     * create a 'virtual' response that is composed using crawl details from the request object
     * this is used when the NOLOAD queue is processed
     * @param request
     * @param profile
     */
    public Response(final Request request, final CrawlProfile profile) {
        this.request = request;
        // request and response headers may be zero in case that we process surrogates
        this.requestHeader = new RequestHeader();
        this.responseHeader = new ResponseHeader();
        this.responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); // tell parser how to handle the content
        if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
        this.responseStatus = "200";
        this.profile = profile;
        this.status = QUEUE_STATE_FRESH;
-        this.content = request.url().toTokens().getBytes();
+        this.content = request.name().length() > 0 ? request.name().getBytes() : request.url().toTokens().getBytes();
    }
    public Response(
@ -824,7 +831,7 @@ public class Response {
        final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
        if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
        try {
-            return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content, false);
+            return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content);
        } catch (final Exception e) {
            return null;
        }
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -60,6 +60,7 @@ import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.ByteBuffer;
 import net.yacy.kelondro.util.FileUtils;
 import de.anomic.crawler.retrieval.Request;
 public class Document {
@ -827,7 +828,8 @@ dc_rights
        final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
        for (final Document d: documents) {
            for (final ImageEntry imageReference : d.getImages().values()) {
-                result.put(imageReference.url(), imageReference.alt());
+                // construct a image name which contains the document title to enhance the search process for images
                result.put(imageReference.url(), description(d, imageReference.alt()));
            }
        }
        return result;
@ -835,20 +837,57 @@ dc_rights
    public static Map<MultiProtocolURI, String> getAudiolinks(final Document[] documents) {
        final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
-        for (final Document d: documents) result.putAll(d.audiolinks);
+        for (final Document d: documents) {
            for (Map.Entry<MultiProtocolURI, String> e: d.audiolinks.entrySet()) {
                result.put(e.getKey(), description(d, e.getValue()));
            }
        }
        return result;
    }
    public static Map<MultiProtocolURI, String> getVideolinks(final Document[] documents) {
        final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
-        for (final Document d: documents) result.putAll(d.videolinks);
+        for (final Document d: documents) {
            for (Map.Entry<MultiProtocolURI, String> e: d.videolinks.entrySet()) {
                result.put(e.getKey(), description(d, e.getValue()));
            }
        }
        return result;
    }
    public static Map<MultiProtocolURI, String> getApplinks(final Document[] documents) {
        final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
-        for (final Document d: documents) result.putAll(d.applinks);
+        for (final Document d: documents) {
            for (Map.Entry<MultiProtocolURI, String> e: d.applinks.entrySet()) {
                result.put(e.getKey(), description(d, e.getValue()));
            }
        }
        return result;
    }
    /**
     * Compose a description text for a link that was found inside a document.
     * The text starts with the document's title, optionally followed by the
     * document's description and subject, and ends with " - " and the tag name
     * of the link. The document part is shortened so that it, the separator and
     * the tag name together fit into Request.descrLength characters; the tag
     * name itself is never shortened here.
     *
     * @param d the document that contains the link
     * @param tagname the name attached to the link; if null or empty, the tokens
     *        of the document's source are used instead
     * @return the trimmed description text
     */
    private static final String description(Document d, String tagname) {
        if (tagname == null || tagname.length() == 0) {
            // no usable name on the link: fall back to the tokens of the document source
            tagname = d.source.toTokens();
        }
        // characters that remain for document metadata once the tag name is accounted for;
        // this is negative when the tag name alone is longer than Request.descrLength
        final int room = Request.descrLength - tagname.length();
        final StringBuilder sb = new StringBuilder(60);
        sb.append(d.dc_title());
        if (!d.dc_description().equals(d.dc_title()) && sb.length() < room) {
            sb.append(' ');
            sb.append(d.dc_description());
        }
        if (sb.length() < room) {
            sb.append(' ');
            sb.append(d.dc_subject(','));
        }
        if (tagname.length() > 0) {
            // reserve 3 characters for the " - " separator. The limit is clamped to zero
            // because StringBuilder.setLength rejects negative lengths, which happened
            // for tag names longer than Request.descrLength - 3.
            final int limit = Math.max(0, room - 3);
            if (sb.length() > limit) {
                // cut this off because otherwise the tagname is lost.
                sb.setLength(limit);
            }
            sb.append(" - ");
            sb.append(tagname);
        }
        return sb.toString().trim();
    }
 }
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@ -31,12 +31,11 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import net.yacy.cora.document.Classification;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.document.UTF8;
 import net.yacy.document.parser.bzipParser;
 import net.yacy.document.parser.csvParser;
 import net.yacy.document.parser.docParser;
@ -60,7 +59,6 @@ import net.yacy.document.parser.vcfParser;
 import net.yacy.document.parser.vsdParser;
 import net.yacy.document.parser.xlsParser;
 import net.yacy.document.parser.zipParser;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.images.genericImageParser;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
@ -144,8 +142,7 @@ public final class TextParser {
            final MultiProtocolURI location,
            final String mimeType,
            final String charset,
-            final File sourceFile,
+            final File sourceFile
            final boolean multipleVirtualDocs
        ) throws InterruptedException, Parser.Failure {
        BufferedInputStream sourceStream = null;
@ -158,7 +155,7 @@ public final class TextParser {
                throw new Parser.Failure(errorMsg, location);
            }
            sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
-            docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream, multipleVirtualDocs);
+            docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
        } catch (final Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -176,8 +173,7 @@ public final class TextParser {
            final MultiProtocolURI location,
            String mimeType,
            final String charset,
-            final byte[] content,
+            final byte[] content
            final boolean multipleVirtualDocs
        ) throws Parser.Failure {
        if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
        mimeType = normalizeMimeType(mimeType);
@ -193,9 +189,6 @@ public final class TextParser {
        Document[] docs = parseSource(location, mimeType, idioms, charset, content);
        // finally enrich the docs set with virtual docs from the enclosed documents
        if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
        return docs;
    }
@ -204,8 +197,7 @@ public final class TextParser {
            String mimeType,
            final String charset,
            final long contentLength,
-            final InputStream sourceStream,
+            final InputStream sourceStream
            final boolean multipleVirtualDocs
        ) throws Parser.Failure {
        if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
        mimeType = normalizeMimeType(mimeType);
@ -236,9 +228,6 @@ public final class TextParser {
        }
        Document[] docs = parseSource(location, mimeType, idioms, charset, b);
        // finally enrich the docs set with virtual docs from the enclosed documents
        if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
        return docs;
    }
@ -281,7 +270,13 @@ public final class TextParser {
        final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
        if (MemoryControl.request(sourceArray.length * 6, false)) {
            for (final Parser parser: parsers) {
-            	ByteArrayInputStream bis = new ByteArrayInputStream(sourceArray);
+            	ByteArrayInputStream bis;
            	if (mimeType.equals("text/plain") && parser.getName().equals("HTML Parser")) {
            	    // a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages.
            	    bis = new ByteArrayInputStream(UTF8.getBytes("<html><head></head><body><h1>" + UTF8.String(sourceArray) + "</h1></body><html>"));
            	} else {
            	    bis = new ByteArrayInputStream(sourceArray);
            	}
                try {
                    docs = parser.parse(location, mimeType, documentCharset, bis);
                } catch (final Parser.Failure e) {
@ -477,73 +472,4 @@ public final class TextParser {
        if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v);
    }
    /**
     * Expand a document into a set of documents: the given document itself,
     * followed by one generated document for every application, audio, video
     * and image link that the document reports.
     * @param document the document whose links are expanded
     * @return an array starting with the given document, followed by the generated link documents
     */
    public static Document[] virtualDocs(final Document document) {
        final List<Document> result = new ArrayList<Document>();
        // the original document always comes first
        result.add(document);

        for (final Map.Entry<MultiProtocolURI, String> app: document.getApplinks().entrySet()) {
            final Document appDoc = genLinkDocs("application", app.getKey(), app.getValue(), document.getContentLanguages());
            result.add(appDoc);
        }

        for (final Map.Entry<MultiProtocolURI, String> audio: document.getAudiolinks().entrySet()) {
            final Document audioDoc = genLinkDocs("audio", audio.getKey(), audio.getValue(), document.getContentLanguages());
            result.add(audioDoc);
        }

        for (final Map.Entry<MultiProtocolURI, String> video: document.getVideolinks().entrySet()) {
            final Document videoDoc = genLinkDocs("video", video.getKey(), video.getValue(), document.getContentLanguages());
            result.add(videoDoc);
        }

        // images carry their own entry object, the map key is not needed
        for (final ImageEntry image: document.getImages().values()) {
            result.add(genImageDocs(image));
        }

        return result.toArray(new Document[result.size()]);
    }
    /**
     * Create a document object that stands for a single link.
     * The mime type is derived from the file extension of the link; the link
     * description is handed to the Document constructor both as a plain string
     * and as a one-element array.
     * @param type the kind of link, passed through to the Document constructor unchanged
     * @param uri the link target
     * @param descr the description text of the link
     * @param contentLanguages the languages to attach to the generated document
     * @return the generated document
     */
    private final static Document genLinkDocs(final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
        final String mime = Classification.ext2mime(uri.getFileExtension());
        final String[] descrArray = new String[]{descr};
        final String normalform = uri.toNormalform(false, false);
        return new Document(
                uri, mime, "UTF-8",
                null, contentLanguages, null,
                descr, "", "",
                descrArray, type,
                0.0f, 0.0f,
                normalform,
                null, null, null,
                false);
    }
    /**
     * Create a document object that stands for a single image.
     * The mime type is derived from the file extension of the image url; the
     * alt text of the image is handed to the Document constructor both as a
     * plain string and as a one-element array.
     * @param img the image entry to convert
     * @return the generated document
     */
    private final static Document genImageDocs(final ImageEntry img) {
        final String mime = Classification.ext2mime(img.url().getFileExtension());
        final String[] altArray = new String[]{img.alt()};
        final String normalform = img.url().toNormalform(false, false);
        return new Document(
                img.url(), mime, "UTF-8",
                null, null, null,
                img.alt(), "", "",
                altArray, "image",
                0.0f, 0.0f,
                normalform,
                null, null, null,
                false);
    }
 }
--- a/source/net/yacy/document/importer/MediawikiImporter.java
+++ b/source/net/yacy/document/importer/MediawikiImporter.java
@ -101,14 +101,17 @@ public class MediawikiImporter extends Thread implements Importer {
        this.urlStub = null;
    }
    @Override
    public int count() {
        return this.count;
    }
    @Override
    public String source() {
        return this.sourcefile.getAbsolutePath();
    }
    @Override
    public String status() {
        return "";
    }
@ -117,6 +120,7 @@ public class MediawikiImporter extends Thread implements Importer {
     * return the number of articles per second
     * @return
     */
    @Override
    public int speed() {
        if (this.count == 0) return 0;
        return (int) (this.count / Math.max(1L, runningTime() ));
@ -126,14 +130,17 @@ public class MediawikiImporter extends Thread implements Importer {
     * return the estimated remaining time, in seconds, for the completion of all records
     * @return
     */
    @Override
    public long remainingTime() {
        return Math.max(0, this.approxdocs - this.count) / Math.max(1, speed() );
    }
    @Override
    public long runningTime() {
        return (System.currentTimeMillis() - this.start) / 1000L;
    }
    @Override
    public void run() {
        this.start = System.currentTimeMillis();
        try {
@ -287,6 +294,7 @@ public class MediawikiImporter extends Thread implements Importer {
            this.mediawikixml = mediawikixml;
        }
        @Override
        public void run() {
            try {
                createIndex(this.mediawikixml);
@ -365,6 +373,7 @@ public class MediawikiImporter extends Thread implements Importer {
            }
        }
        @Override
        public Integer call() {
            wikisourcerecord r;
            try {
@ -412,6 +421,7 @@ public class MediawikiImporter extends Thread implements Importer {
            }
        }
        @Override
        public Integer call() {
            wikisourcerecord r;
            wikiraw c;
@ -505,7 +515,7 @@ public class MediawikiImporter extends Thread implements Importer {
        public void genDocument() throws Parser.Failure {
            try {
 				this.url = new DigestURI(this.urlStub + this.title);
-				final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html), false);
+				final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html));
 				this.document = Document.mergeDocuments(this.url, "text/html", parsed);
 				// the wiki parser is not able to find the proper title in the source text, so it must be set here
 				this.document.setTitle(this.title);
@ -626,6 +636,7 @@ public class MediawikiImporter extends Thread implements Importer {
            this.out = out;
        }
        @Override
        public Integer call() {
            wikiparserrecord record;
            try {
@ -682,6 +693,7 @@ public class MediawikiImporter extends Thread implements Importer {
            this.outputfilename = null;
        }
        @Override
        public Integer call() {
            wikiparserrecord record;
            try {
--- a/source/net/yacy/document/parser/bzipParser.java
+++ b/source/net/yacy/document/parser/bzipParser.java
@ -55,6 +55,7 @@ public class bzipParser extends AbstractParser implements Parser {
        this.SUPPORTED_MIME_TYPES.add("application/x-stuffit");
    }
    @Override
    public Document[] parse(final MultiProtocolURI location, final String mimeType,
            final String charset, final InputStream source)
            throws Parser.Failure, InterruptedException {
@ -93,7 +94,7 @@ public class bzipParser extends AbstractParser implements Parser {
            out.close();
            // creating a new parser class to parse the unzipped content
-            docs = TextParser.parseSource(location, null, null, tempFile, false);
+            docs = TextParser.parseSource(location, null, null, tempFile);
        } catch (final Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof Parser.Failure) throw (Parser.Failure) e;
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@ -54,6 +54,7 @@ public class gzipParser extends AbstractParser implements Parser {
        this.SUPPORTED_MIME_TYPES.add("gzip/document");
    }
    @Override
    public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
        File tempFile = null;
@ -78,7 +79,7 @@ public class gzipParser extends AbstractParser implements Parser {
            out.close();
            // creating a new parser class to parse the unzipped content
-            docs = TextParser.parseSource(location,null,null,tempFile, false);
+            docs = TextParser.parseSource(location,null,null,tempFile);
        } catch (final Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof Parser.Failure) throw (Parser.Failure) e;
--- a/source/net/yacy/document/parser/sevenzipParser.java
+++ b/source/net/yacy/document/parser/sevenzipParser.java
@ -99,6 +99,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
        }
    }
    @Override
    public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset,
            final InputStream source) throws Parser.Failure, InterruptedException {
        try {
@ -166,7 +167,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
                     // below for reversion of the effects
                     final MultiProtocolURI url = MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath);
                     final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
-                     theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray(), false);
+                     theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
                     this.doc.addSubDocuments(theDocs);
                 }
--- a/source/net/yacy/document/parser/tarParser.java
+++ b/source/net/yacy/document/parser/tarParser.java
@ -59,6 +59,7 @@ public class tarParser extends AbstractParser implements Parser {
        this.SUPPORTED_MIME_TYPES.add("multipart/x-tar");
    }
    @Override
    public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
        final List<Document> docacc = new ArrayList<Document>();
@ -88,7 +89,7 @@ public class tarParser extends AbstractParser implements Parser {
                try {
                    tmp = FileUtils.createTempFile(this.getClass(), name);
                    FileUtils.copy(tis, tmp, entry.getSize());
-                    subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp, false);
+                    subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
                    if (subDocs == null) continue;
                    for (final Document d: subDocs) docacc.add(d);
                } catch (final Parser.Failure e) {
--- a/source/net/yacy/document/parser/zipParser.java
+++ b/source/net/yacy/document/parser/zipParser.java
@ -59,6 +59,7 @@ public class zipParser extends AbstractParser implements Parser {
        this.SUPPORTED_MIME_TYPES.add("application/vnd.android.package-archive");
    }
    @Override
    public Document[] parse(final MultiProtocolURI url, final String mimeType,
            final String charset, final InputStream source)
            throws Parser.Failure, InterruptedException {
@ -87,7 +88,7 @@ public class zipParser extends AbstractParser implements Parser {
                    FileUtils.copy(zis, tmp, entry.getSize());
                    final MultiProtocolURI virtualURL = MultiProtocolURI.newURL(url, "#" + name);
                    //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
-                    docs = TextParser.parseSource(virtualURL, mime, null, tmp, false);
+                    docs = TextParser.parseSource(virtualURL, mime, null, tmp);
                    if (docs == null) continue;
                    for (final Document d: docs) docacc.add(d);
                } catch (final Parser.Failure e) {
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@ -386,7 +386,7 @@ public final class LoaderDispatcher {
        final String supportError = TextParser.supports(url, responseHeader.mime());
        if (supportError != null) throw new IOException("no parser support: " + supportError);
        try {
-            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent(), false);
+            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent());
            if (documents == null) throw new IOException("document == null");
        } catch (final Exception e) {
            throw new IOException("parser error: " + e.getMessage());
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -2329,8 +2329,7 @@ public final class Switchboard extends serverSwitch
                    response.url(),
                    response.getMimeType(),
                    response.getCharacterEncoding(),
-                    response.getContent(),
+                    response.getContent());
                    response.profile().directDocByURL());
            if ( documents == null ) {
                throw new Parser.Failure("Parser returned null.", response.url());
            }
--- a/source/net/yacy/search/index/DocumentIndex.java
+++ b/source/net/yacy/search/index/DocumentIndex.java
@ -150,7 +150,7 @@ public class DocumentIndex extends Segment
            length = -1;
        }
        try {
-            documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1), true);
+            documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
        } catch ( final Exception e ) {
            throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
        }