next refactoring step in document indexing to prepare concurrency environment for document parsing

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4604 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago · 9b0e20fb06
parent 7f9f639d20
commit 9b0e20fb06
5 changed files with 153 additions and 8 deletions
--- a/source/de/anomic/index/indexRepositoryReference.java
+++ b/source/de/anomic/index/indexRepositoryReference.java
@ -54,7 +54,7 @@ import de.anomic.yacy.yacyURL;
 public final class indexRepositoryReference {
    // class objects
-    private kelondroIndex urlIndexFile;
+    kelondroIndex urlIndexFile;
    private Export exportthread = null; // will have an export thread assigned if the exporter is running
    public indexRepositoryReference(File indexSecondaryRoot, String networkName) {
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@ -54,6 +54,7 @@ import de.anomic.server.serverFileUtils;
 import de.anomic.yacy.yacyURL;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedList;
@ -86,6 +87,7 @@ public class plasmaParserDocument {
    private yacyURL favicon;
    private boolean resorted;
    private InputStream textStream;
    private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
    protected plasmaParserDocument(yacyURL location, String mimeType, String charset,
                    String[] keywords, String title, String author,
@ -107,6 +109,8 @@ public class plasmaParserDocument {
        this.applinks = null;
        this.emaillinks = null;
        this.resorted = false;
        this.inboundLinks = -1;
        this.outboundLinks = -1;
        if (text == null) try {
            this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
@ -430,6 +434,22 @@ dc_rights
    	this.favicon = faviconURL;
    }
    /**
     * Hands this document to the web structure so that a citation reference
     * is generated for it, and remembers the two link counters that call returns.
     *
     * @param webStructure the web structure that generates the citation reference
     * @param condenser    the condenser belonging to this document
     * @param docDate      the document date passed through to the citation reference
     */
    public void notifyWebStructure(plasmaWebStructure webStructure, plasmaCondenser condenser, Date docDate) {
        // the returned pair is annotated by the callee as (outlinksSame, outlinksOther)
        final Integer[] linkCounts = webStructure.generateCitationReference(this, condenser, docDate);
        // first element becomes the inbound counter, second the outbound counter
        this.inboundLinks = linkCounts[0];
        this.outboundLinks = linkCounts[1];
    }
    /**
     * Returns the inbound link counter of this document.
     * The constructor initializes the counter to -1 and notifyWebStructure
     * assigns the real value; with assertions enabled a still-negative counter
     * trips the assertion, otherwise it is reported as 0.
     *
     * @return the inbound link counter, never negative
     */
    public int inboundLinks() {
        assert this.inboundLinks >= 0;
        // clamp a not-yet-assigned (negative) counter to zero
        return Math.max(0, this.inboundLinks);
    }
    /**
     * Returns the outbound link counter of this document.
     * The constructor initializes the counter to -1 and notifyWebStructure
     * assigns the real value; with assertions enabled a still-negative counter
     * trips the assertion, otherwise it is reported as 0.
     *
     * @return the outbound link counter, never negative
     */
    public int outboundLinks() {
        assert this.outboundLinks >= 0;
        // clamp a not-yet-assigned (negative) counter to zero
        return Math.max(0, this.outboundLinks);
    }
    public void close() {
        // try close the output stream
        if (this.textStream != null) {
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -1862,7 +1862,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
            if (document != null) {
                plasmaCondenser condensement = condenseDocument(nextentry, document);
                if (condensement != null) {
-                    indexDocument(nextentry, document, condensement);
+                    document.notifyWebStructure(webStructure, condensement, nextentry.getModificationDate());
                    storeDocumentIndex(nextentry, document, condensement);
                }
            }
            return true;
@ -2228,8 +2229,58 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
        return condenser;
    }
    /**
     * Stores a parsed and condensed document through the word index and performs
     * the follow-up bookkeeping: crawl result statistics, the indexed-pages counter,
     * profiling, and the crawl receipt for remote crawl requests.
     * Failures are logged and recorded in the error DB; nothing is thrown to the caller
     * for an IOException raised while storing.
     *
     * @param entry     the queue entry that describes the loaded resource
     * @param document  the parsed document
     * @param condenser the condenser result for the document
     */
    private void storeDocumentIndex(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) {
        // CREATE INDEX
        String dc_title = document.dc_title();
        yacyURL referrerURL = entry.referrerURL();
        // The referrer may be absent (storeDocument guards the same value against null),
        // so the hash is resolved null-safe once; otherwise the error paths below would
        // themselves fail with a NullPointerException.
        // NOTE(review): an empty hash is recorded for a missing referrer - confirm that
        // addURLtoErrorDB accepts it.
        String referrerHash = (referrerURL == null) ? "" : referrerURL.hash();
        int processCase = entry.processCase();

        // remove stopwords
        log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());

        // STORE URL TO LOADED-URL-DB
        indexURLReference newEntry = null;
        try {
            newEntry = wordIndex.storeDocument(entry, document, condenser);
        } catch (IOException e) {
            log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
            addURLtoErrorDB(entry.url(), referrerHash, entry.initiator(), dc_title, "error storing url: " + e.getMessage(), new kelondroBitfield());
            return;
        }

        // update statistics
        crawlResults.stack(
                newEntry,                      // loaded url db entry
                entry.initiator(),             // initiator peer hash
                yacyCore.seedDB.mySeed().hash, // executor peer hash
                processCase                    // process case
        );

        // STORE WORD INDEX
        // NOTE(review): this profile check runs after wordIndex.storeDocument was already
        // called above - confirm that this ordering is intended.
        if ((!entry.profile().indexText()) && (!entry.profile().indexMedia())) {
            log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
            addURLtoErrorDB(entry.url(), referrerHash, entry.initiator(), dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
            return;
        }

        // increment number of indexed urls
        indexedPages++;

        // update profiling info
        plasmaProfiling.updateIndexedPage(entry);

        // if this was performed for a remote crawl request, notify requester
        yacySeed initiatorPeer = entry.initiatorPeer();
        if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
            log.logInfo("Sending crawl receipt for '" + entry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
            if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
            yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, "");
        }
    }
    /*
    private void indexDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) throws InterruptedException {
-        long indexingStartTime = 0, indexingEndTime = 0,
+        long indexingStartTime = System.currentTimeMillis(), indexingEndTime = 0,
        storageStartTime = 0, storageEndTime = 0;
        // CREATE INDEX
@ -2348,6 +2399,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
            yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, "");
        }
    }
    */
    private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy");
    public static String dateString(Date date) {
--- a/source/de/anomic/plasma/plasmaWebStructure.java
+++ b/source/de/anomic/plasma/plasmaWebStructure.java
@ -92,8 +92,8 @@ public class plasmaWebStructure {
        }
    }
-    public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(yacyURL url, String baseurlhash, Date docDate, plasmaParserDocument document, plasmaCondenser condenser) {
+    public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(plasmaParserDocument document, plasmaCondenser condenser, Date docDate) {
-        assert url.hash().equals(baseurlhash);
+        yacyURL url = document.dc_source();
        // generate citation reference
        Map<yacyURL, String> hl = document.getHyperlinks();
@ -101,7 +101,7 @@ public class plasmaWebStructure {
        String nexturlhash;
        StringBuffer cpg = new StringBuffer(12 * (hl.size() + 1) + 1);
        StringBuffer cpl = new StringBuffer(12 * (hl.size() + 1) + 1);
-        String lhp = baseurlhash.substring(6); // local hash part
+        String lhp = url.hash().substring(6); // local hash part
        int GCount = 0;
        int LCount = 0;
        while (it.hasNext()) {
@ -121,7 +121,7 @@ public class plasmaWebStructure {
        // append this reference to buffer
        // generate header info
-        String head = baseurlhash + "=" +
+        String head = url.hash() + "=" +
        plasmaWordIndex.microDateHoursStr(docDate.getTime()) +          // latest update timestamp of the URL
        plasmaWordIndex.microDateHoursStr(System.currentTimeMillis()) + // last visit timestamp of the URL
        kelondroBase64Order.enhancedCoder.encodeLongSmart(LCount, 2) +  // count of links to local resources
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -75,9 +75,11 @@ public final class plasmaWordIndex implements indexRI {
    private final indexRAMRI               dhtOutCache, dhtInCache;
    private final indexCollectionRI        collections;          // new database structure to replace AssortmentCluster and FileCluster
    private       int                      flushsize;
-    private final indexRepositoryReference referenceURL;
+    private       serverLog                log;
    final         indexRepositoryReference referenceURL;
    public plasmaWordIndex(File indexPrimaryRoot, File indexSecondaryRoot, String networkName, serverLog log) {
        this.log = log;
        File indexPrimaryPath = new File(indexPrimaryRoot, networkName);
        File indexPrimaryTextLocation = new File(indexPrimaryPath, "TEXT");
        if (!indexPrimaryTextLocation.exists()) {
@ -603,6 +605,77 @@ public final class plasmaWordIndex implements indexRI {
        return containers; // this may return less containers as demanded
    }
    /**
     * Builds the loaded-URL entry for a parsed document, stores it via putURL,
     * then adds the page to the word index via addPageIndex. If info logging is
     * enabled, a summary including both storage durations is written to the log.
     *
     * @param entry     the queue entry that describes the loaded resource
     * @param document  the parsed document
     * @param condenser the condenser result for the document
     * @return the newly created loaded-URL entry
     * @throws IOException declared by this method; presumably raised by the storage calls - confirm
     */
    public indexURLReference storeDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) throws IOException {
        final long began = System.currentTimeMillis();

        // attributes that are needed in more than one place
        final String title = document.dc_title();
        final yacyURL referrer = entry.referrerURL();
        final Date modified = entry.getModificationDate();

        // freshdate, computed with Proxy-TTL formula
        final long now = System.currentTimeMillis();
        final Date freshDate = new Date(now + Math.max(0, now - modified.getTime()) / 2);

        // a document without referrer is stored with a null referrer hash
        final String referrerHash = (referrer == null) ? null : referrer.hash();

        // create a new loaded URL db entry
        final indexURLReference urlEntry = new indexURLReference(
                entry.url(),                                 // URL
                title,                                       // document description
                document.dc_creator(),                       // author
                document.dc_subject(' '),                    // tags
                "",                                          // ETag
                modified,                                    // modification date
                new Date(),                                  // loaded date
                freshDate,                                   // freshdate
                referrerHash,                                // referer hash
                new byte[0],                                 // md5
                (int) entry.size(),                          // size
                condenser.RESULT_NUMB_WORDS,                 // word count
                plasmaHTCache.docType(document.dc_format()), // doctype
                condenser.RESULT_FLAGS,                      // flags
                yacyURL.language(entry.url()),               // language
                document.inboundLinks(),                     // inbound links
                document.outboundLinks(),                    // outbound links
                document.getAudiolinks().size(),             // laudio
                document.getImages().size(),                 // limage
                document.getVideolinks().size(),             // lvideo
                document.getApplinks().size()                // lapp
        );

        // STORE URL TO LOADED-URL-DB
        putURL(urlEntry);
        final long urlStored = System.currentTimeMillis();

        // STORE PAGE INDEX INTO WORD INDEX DB
        final int wordCount = addPageIndex(
                entry.url(),                                 // document url
                modified,                                    // document mod date
                (int) entry.size(),                          // document size
                document,                                    // document content
                condenser,                                   // document condenser
                yacyURL.language(entry.url()),               // document language
                plasmaHTCache.docType(document.dc_format()), // document type
                document.inboundLinks(),                     // inbound links
                document.outboundLinks()                     // outbound links
        );
        final long wordsStored = System.currentTimeMillis();

        if (log.isInfo()) {
            // TODO: UTF-8 docDescription seems not to be displayed correctly because
            // of string concatenation
            final StringBuilder message = new StringBuilder();
            message.append("*Indexed ").append(wordCount);
            message.append(" words in URL ").append(entry.url());
            message.append(" [").append(entry.urlHash()).append("]");
            message.append("\n\tDescription:  ").append(title);
            message.append("\n\tMimeType: ").append(document.dc_format());
            message.append(" | Charset: ").append(document.getCharset());
            message.append(" | Size: ").append(document.getTextLength()).append(" bytes | ");
            message.append("Anchors: ").append((document.getAnchors() == null) ? 0 : document.getAnchors().size());
            message.append("\n\tLinkStorageTime: ").append(urlStored - began).append(" ms | ");
            message.append("indexStorageTime: ").append(wordsStored - urlStored).append(" ms");
            log.logInfo(message.toString());
        }

        // finished
        return urlEntry;
    }
    public synchronized kelondroCloneableIterator<indexContainer> wordContainers(String startHash, boolean ram, boolean rot) {
        kelondroCloneableIterator<indexContainer> i = wordContainers(startHash, ram);
        if (rot) {