diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index c66845dd2..244dc37d8 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -384,16 +384,11 @@ public final class Switchboard extends serverSwitch // initialize index ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0); - final File oldSingleSegment = new File(new File(indexPath, networkName), "TEXT"); - final File newSegmentsPath = new File(new File(indexPath, networkName), "SEGMENTS"); - Segments.migrateOld( - oldSingleSegment, - newSegmentsPath, - getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default")); + final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS"); this.indexSegments = new Segments( this.log, - newSegmentsPath, + segmentsPath, wordCacheMaxCount, fileSizeMax, this.useTailCache, diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index f675e3c32..3d136084d 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -31,6 +31,7 @@ import java.io.IOException; import java.util.Date; import java.util.Iterator; import java.util.Map; +import java.util.Properties; import java.util.Set; import java.util.TreeSet; @@ -47,8 +48,6 @@ import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReferenceFactory; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.data.navigation.NavigationReference; -import net.yacy.kelondro.data.navigation.NavigationReferenceFactory; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceFactory; @@ -81,12 +80,12 @@ public class Segment { // the reference factory public static final ReferenceFactory wordReferenceFactory = new WordReferenceFactory(); 
public static final ReferenceFactory citationReferenceFactory = new CitationReferenceFactory(); - public static final ReferenceFactory navigationReferenceFactory = new NavigationReferenceFactory(); + //public static final ReferenceFactory navigationReferenceFactory = new NavigationReferenceFactory(); public static final ByteOrder wordOrder = Base64Order.enhancedCoder; private final Log log; protected final IndexCell termIndex; - //protected final IndexCell urlCitationIndex; + protected final IndexCell urlCitationIndex; //private final IndexCell authorNavIndex; protected final MetadataRepository urlMetadata; private final File segmentPath; @@ -99,9 +98,6 @@ public class Segment { final boolean useTailCache, final boolean exceed134217727) throws IOException { - migrateTextIndex(segmentPath, segmentPath); - migrateTextMetadata(segmentPath, segmentPath); - log.logInfo("Initializing Segment '" + segmentPath + "."); this.log = log; @@ -117,10 +113,10 @@ public class Segment { targetFileSize, maxFileSize, writeBufferSize); -/* + this.urlCitationIndex = new IndexCell( segmentPath, - "urlcitation.index", + "citation.index", citationReferenceFactory, wordOrder, Word.commonHashLength, @@ -128,7 +124,7 @@ public class Segment { targetFileSize, maxFileSize, writeBufferSize); -*/ + /* this.authorNavIndex = new IndexCell( new File(new File(segmentPath, "nav_author"), "idx"), @@ -154,32 +150,6 @@ public class Segment { return this.urlMetadata.getSolr(); } - public static void migrateTextIndex(final File oldSegmentPath, final File newSegmentPath) { - final File oldCellPath = new File(oldSegmentPath, "RICELL"); - if (!oldCellPath.exists()) return; - final String[] oldIndexFiles = oldCellPath.list(); - for (final String oldIndexFile: oldIndexFiles) { - if (oldIndexFile.startsWith("index.")) { - final File newFile = new File(newSegmentPath, "text.index." 
+ oldIndexFile.substring(6)); - new File(oldCellPath, oldIndexFile).renameTo(newFile); - } - } - oldCellPath.delete(); - } - - public static void migrateTextMetadata(final File oldSegmentPath, final File newSegmentPath) { - final File oldMetadataPath = new File(oldSegmentPath, "METADATA"); - if (!oldMetadataPath.exists()) return; - final String[] oldMetadataFiles = oldMetadataPath.list(); - for (final String oldMetadataFile: oldMetadataFiles) { - if (oldMetadataFile.startsWith("urls.")) { - final File newFile = new File(newSegmentPath, "text.urlmd." + oldMetadataFile.substring(5)); - new File(oldMetadataPath, oldMetadataFile).renameTo(newFile); - } - } - oldMetadataPath.delete(); - } - public MetadataRepository urlMetadata() { return this.urlMetadata; } @@ -188,6 +158,10 @@ public class Segment { return this.termIndex; } + public IndexCell urlCitation() { + return this.urlCitationIndex; + } + public boolean exists(final byte[] urlhash) { return this.urlMetadata.exists(urlhash); } @@ -196,6 +170,7 @@ public class Segment { try { this.termIndex.clear(); this.urlMetadata.clear(); + this.urlCitationIndex.clear(); } catch (final IOException e) { Log.logException(e); } @@ -238,7 +213,7 @@ public class Segment { final int urlLength = url.toNormalform(true, true).length(); final int urlComps = MultiProtocolURI.urlComps(url.toString()).length; - // iterate over all words of context text + // iterate over all words of content text final Iterator> i = condenser.words().entrySet().iterator(); Map.Entry wentry; String word; @@ -267,6 +242,11 @@ public class Segment { Log.logException(e); } wordCount++; + + // during a search event it is possible that a heuristic is used which acquires index + // data during search-time. 
To transfer indexed data directly to the search process + // the following lines push the index data additionally to the search process + // this is done only for searched words if (searchEvent != null && !searchEvent.getQuery().excludeHashes.has(wordhash) && searchEvent.getQuery().queryHashes.has(wordhash)) { // if the page was added in the context of a heuristic this shall ensure that findings will fire directly into the search result ReferenceContainer container; @@ -282,9 +262,31 @@ public class Segment { return wordCount; } + private int addCitationIndex(final DigestURI url, final Date urlModified, final Document document) { + if (document.getAnchors() == null) return 0; + int refCount = 0; + + // iterate over all outgoing links, this will create a context for those links + final byte[] urlhash = url.hash(); + final long urldate = urlModified.getTime(); + for (Map.Entry anchorEntry: document.getAnchors().entrySet()) { + MultiProtocolURI anchor = anchorEntry.getKey(); + byte[] refhash = new DigestURI(anchor).hash(); + //System.out.println("*** addCitationIndex: urlhash = " + ASCII.String(urlhash) + ", refhash = " + ASCII.String(refhash) + ", urldate = " + urlModified.toString()); + try { + this.urlCitationIndex.add(urlhash, new CitationReference(refhash, urldate)); + } catch (final Exception e) { + Log.logException(e); + } + refCount++; + } + return refCount; + } + public void close() { this.termIndex.close(); this.urlMetadata.close(); + this.urlCitationIndex.close(); } public URIMetadataRow storeDocument( @@ -392,6 +394,11 @@ public class Segment { searchEvent, // a search event that can have results directly sourceName // the name of the source where the index was created ); + + // STORE PAGE REFERENCES INTO CITATION INDEX + final int refs = addCitationIndex(url, modDate, document); + + // finish index time final long indexingEndTime = System.currentTimeMillis(); if (this.log.isInfo()) { @@ -402,7 +409,7 @@ public class Segment { "\n\tDescription: " + 
dc_title + "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " + "Size: " + document.getTextLength() + " bytes | " + - "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) + + "Anchors: " + refs + "\n\tLinkStorageTime: " + (storageEndTime - startTime) + " ms | " + "indexStorageTime: " + (indexingEndTime - storageEndTime) + " ms"); } diff --git a/source/net/yacy/search/index/Segments.java b/source/net/yacy/search/index/Segments.java index 1adb0bfa5..2f7e3fb57 100644 --- a/source/net/yacy/search/index/Segments.java +++ b/source/net/yacy/search/index/Segments.java @@ -101,21 +101,6 @@ public class Segments implements Iterable { this.process_assignment.put(process, segmentName); } - public static void migrateOld(final File oldSingleSegment, final File newSegmentsPath, final String newSegmentName) { - if (!oldSingleSegment.exists()) return; - final File newSegmentPath = new File(newSegmentsPath, newSegmentName); - if (!newSegmentPath.exists()) newSegmentPath.mkdirs(); - Segment.migrateTextIndex(oldSingleSegment, newSegmentPath); - Segment.migrateTextMetadata(oldSingleSegment, newSegmentPath); - - final String[] oldFiles = oldSingleSegment.list(); - for (final String oldFile: oldFiles) { - if (oldFile.startsWith("text.")) { - new File(oldSingleSegment, oldFile).renameTo(new File(newSegmentPath, oldFile)); - } - } - } - public String[] segmentNames() { return this.segments.keySet().toArray(new String[this.segments.size()]); }