added storage of full anchor link structure:

The links between all pages are now stored. The same index structure that is used for the word index is used to build a reverse link index. The new file(s) in SEGMENT/default/citation.index.*.blob store the citation index. This will be used to create much more detailed link structures for the YaCy APIs and to create a better ranking. A ranking using the citation.index should provide better results, especially for portal indexes and intranets.
13 years ago · 8fc86fe397
parent 22f05c83ff
commit 8fc86fe397
3 changed files with 47 additions and 60 deletions
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -384,16 +384,11 @@ public final class Switchboard extends serverSwitch

        // initialize index
        ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0);
-        final File oldSingleSegment = new File(new File(indexPath, networkName), "TEXT");
-        final File newSegmentsPath = new File(new File(indexPath, networkName), "SEGMENTS");
-        Segments.migrateOld(
-            oldSingleSegment,
-            newSegmentsPath,
-            getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"));
+        final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS");
        this.indexSegments =
            new Segments(
                this.log,
-                newSegmentsPath,
+                segmentsPath,
                wordCacheMaxCount,
                fileSizeMax,
                this.useTailCache,
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@ -31,6 +31,7 @@ import java.io.IOException;
 import java.util.Date;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Properties;
 import java.util.Set;
 import java.util.TreeSet;

@ -47,8 +48,6 @@ import net.yacy.kelondro.data.citation.CitationReference;
 import net.yacy.kelondro.data.citation.CitationReferenceFactory;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.kelondro.data.navigation.NavigationReference;
-import net.yacy.kelondro.data.navigation.NavigationReferenceFactory;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReference;
 import net.yacy.kelondro.data.word.WordReferenceFactory;
@ -81,12 +80,12 @@ public class Segment {
    // the reference factory
    public static final ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory();
    public static final ReferenceFactory<CitationReference> citationReferenceFactory = new CitationReferenceFactory();
-    public static final ReferenceFactory<NavigationReference> navigationReferenceFactory = new NavigationReferenceFactory();
+    //public static final ReferenceFactory<NavigationReference> navigationReferenceFactory = new NavigationReferenceFactory();
    public static final ByteOrder wordOrder = Base64Order.enhancedCoder;

    private   final Log                            log;
    protected final IndexCell<WordReference>       termIndex;
-    //protected final IndexCell<CitationReference>   urlCitationIndex;
+    protected final IndexCell<CitationReference>   urlCitationIndex;
    //private   final IndexCell<NavigationReference> authorNavIndex;
    protected final MetadataRepository             urlMetadata;
    private   final File                           segmentPath;
@ -99,9 +98,6 @@ public class Segment {
            final boolean useTailCache,
            final boolean exceed134217727) throws IOException {

-        migrateTextIndex(segmentPath, segmentPath);
-        migrateTextMetadata(segmentPath, segmentPath);
-
        log.logInfo("Initializing Segment '" + segmentPath + ".");

        this.log = log;
@ -117,10 +113,10 @@ public class Segment {
                targetFileSize,
                maxFileSize,
                writeBufferSize);
-/*
+
        this.urlCitationIndex = new IndexCell<CitationReference>(
                segmentPath,
-                "urlcitation.index",
+                "citation.index",
                citationReferenceFactory,
                wordOrder,
                Word.commonHashLength,
@ -128,7 +124,7 @@ public class Segment {
                targetFileSize,
                maxFileSize,
                writeBufferSize);
-*/
+
        /*
        this.authorNavIndex = new IndexCell<NavigationReference>(
                new File(new File(segmentPath, "nav_author"), "idx"),
@ -154,32 +150,6 @@ public class Segment {
        return this.urlMetadata.getSolr();
    }

-    public static void migrateTextIndex(final File oldSegmentPath, final File newSegmentPath) {
-        final File oldCellPath = new File(oldSegmentPath, "RICELL");
-        if (!oldCellPath.exists()) return;
-        final String[] oldIndexFiles = oldCellPath.list();
-        for (final String oldIndexFile: oldIndexFiles) {
-            if (oldIndexFile.startsWith("index.")) {
-                final File newFile = new File(newSegmentPath, "text.index." + oldIndexFile.substring(6));
-                new File(oldCellPath, oldIndexFile).renameTo(newFile);
-            }
-        }
-        oldCellPath.delete();
-    }
-
-    public static void migrateTextMetadata(final File oldSegmentPath, final File newSegmentPath) {
-        final File oldMetadataPath = new File(oldSegmentPath, "METADATA");
-        if (!oldMetadataPath.exists()) return;
-        final String[] oldMetadataFiles = oldMetadataPath.list();
-        for (final String oldMetadataFile: oldMetadataFiles) {
-            if (oldMetadataFile.startsWith("urls.")) {
-                final File newFile = new File(newSegmentPath, "text.urlmd." + oldMetadataFile.substring(5));
-                new File(oldMetadataPath, oldMetadataFile).renameTo(newFile);
-            }
-        }
-        oldMetadataPath.delete();
-    }
-
    public MetadataRepository urlMetadata() {
        return this.urlMetadata;
    }
@ -188,6 +158,10 @@ public class Segment {
        return this.termIndex;
    }

+    public IndexCell<CitationReference> urlCitation() {
+        return this.urlCitationIndex;
+    }
+
    public boolean exists(final byte[] urlhash) {
        return this.urlMetadata.exists(urlhash);
    }
@ -196,6 +170,7 @@ public class Segment {
        try {
            this.termIndex.clear();
            this.urlMetadata.clear();
+            this.urlCitationIndex.clear();
        } catch (final IOException e) {
            Log.logException(e);
        }
@ -238,7 +213,7 @@ public class Segment {
        final int urlLength = url.toNormalform(true, true).length();
        final int urlComps = MultiProtocolURI.urlComps(url.toString()).length;

-        // iterate over all words of context text
+        // iterate over all words of content text
        final Iterator<Map.Entry<String, Word>> i = condenser.words().entrySet().iterator();
        Map.Entry<String, Word> wentry;
        String word;
@ -267,6 +242,11 @@ public class Segment {
                Log.logException(e);
            }
            wordCount++;
+            
+            // During a search event a heuristic may be used which acquires index data
+            // at search time. To hand that freshly indexed data directly to the running
+            // search, the following lines additionally push the index data into the
+            // search process. This is done only for words that were searched for.
            if (searchEvent != null && !searchEvent.getQuery().excludeHashes.has(wordhash) && searchEvent.getQuery().queryHashes.has(wordhash)) {
                // if the page was added in the context of a heuristic this shall ensure that findings will fire directly into the search result
                ReferenceContainer<WordReference> container;
@ -282,9 +262,31 @@ public class Segment {
        return wordCount;
    }

+    private int addCitationIndex(final DigestURI url, final Date urlModified, final Document document) {
+    	if (document.getAnchors() == null) return 0;
+    	int refCount = 0;
+
+        // iterate over all outgoing links, this will create a context for those links
+        final byte[] urlhash = url.hash();
+        final long urldate = urlModified.getTime();
+        for (Map.Entry<MultiProtocolURI, Properties> anchorEntry: document.getAnchors().entrySet()) {
+        	MultiProtocolURI anchor = anchorEntry.getKey();
+        	byte[] refhash = new DigestURI(anchor).hash();
+        	//System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString());
+            try {
+                this.urlCitationIndex.add(urlhash, new CitationReference(refhash, urldate));
+            } catch (final Exception e) {
+                Log.logException(e);
+            }
+            refCount++;
+        }
+        return refCount;
+    }
+    
    public void close() {
        this.termIndex.close();
        this.urlMetadata.close();
+        this.urlCitationIndex.close();
    }

    public URIMetadataRow storeDocument(
@ -392,6 +394,11 @@ public class Segment {
                searchEvent,                                  // a search event that can have results directly
                sourceName                                    // the name of the source where the index was created
        );
+        
+        // STORE PAGE REFERENCES INTO CITATION INDEX
+        final int refs = addCitationIndex(url, modDate, document);
+        
+        // finish index time
        final long indexingEndTime = System.currentTimeMillis();

        if (this.log.isInfo()) {
@ -402,7 +409,7 @@ public class Segment {
                    "\n\tDescription:  " + dc_title +
                    "\n\tMimeType: "  + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
                    "Size: " + document.getTextLength() + " bytes | " +
-                    "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
+                    "Anchors: " + refs +
                    "\n\tLinkStorageTime: " + (storageEndTime - startTime) + " ms | " +
                    "indexStorageTime: " + (indexingEndTime - storageEndTime) + " ms");
        }
--- a/source/net/yacy/search/index/Segments.java
+++ b/source/net/yacy/search/index/Segments.java
@ -101,21 +101,6 @@ public class Segments implements Iterable<Segment> {
        this.process_assignment.put(process, segmentName);
    }

-    public static void migrateOld(final File oldSingleSegment, final File newSegmentsPath, final String newSegmentName) {
-        if (!oldSingleSegment.exists()) return;
-        final File newSegmentPath = new File(newSegmentsPath, newSegmentName);
-        if (!newSegmentPath.exists()) newSegmentPath.mkdirs();
-        Segment.migrateTextIndex(oldSingleSegment, newSegmentPath);
-        Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath);
-
-        final String[] oldFiles = oldSingleSegment.list();
-        for (final String oldFile: oldFiles) {
-            if (oldFile.startsWith("text.")) {
-                new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile));
-            }
-        }
-    }
-
    public String[] segmentNames() {
        return this.segments.keySet().toArray(new String[this.segments.size()]);
    }