- more abstraction for the RWI index, in preparation for Solr integration

- added options to the search index to switch parts of the index on or off
pull/1/head
orbiter 13 years ago
parent 6cc5d1094e
commit 69e743d9e3

@ -46,7 +46,7 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.peers.NewsPool; import net.yacy.peers.NewsPool;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
@ -194,7 +194,7 @@ public class Bookmarks {
final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash); final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) { if (bookmark == null) {
// try to get the bookmark from the LURL database // try to get the bookmark from the LURL database
final URIMetadataRow urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash)); final URIMetadata urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash));
if (urlentry != null) try { if (urlentry != null) try {
final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null)); final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null));
prop.put("mode_edit", "0"); // create mode prop.put("mode_edit", "0"); // create mode

@ -35,7 +35,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8; import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.peers.Seed; import net.yacy.peers.Seed;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
@ -178,7 +178,7 @@ public class CrawlResults {
boolean dark = true; boolean dark = true;
String urlstr, urltxt; String urlstr, urltxt;
Seed initiatorSeed, executorSeed; Seed initiatorSeed, executorSeed;
URIMetadataRow urle; URIMetadata urle;
int cnt = 0; int cnt = 0;
final Iterator<Map.Entry<String, InitExecEntry>> i = ResultURLs.results(tabletype); final Iterator<Map.Entry<String, InitExecEntry>> i = ResultURLs.results(tabletype);

@ -33,10 +33,10 @@
<dt class="TableCellDark">Index Deletion</dt> <dt class="TableCellDark">Index Deletion</dt>
<dd><input type="checkbox" name="deleteIndex" id="deleteIndex" <dd><input type="checkbox" name="deleteIndex" id="deleteIndex"
onclick="x=document.getElementById('deleteIndex').checked;document.getElementById('deleteTriplestore').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deleteTriplestore').disabled=c;document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;" onclick="x=document.getElementById('deleteIndex').checked;document.getElementById('deleteTriplestore').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deleteTriplestore').disabled=c;document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
/><label for="deleteIndex">Delete Search Index</label><br/> /><label for="deleteIndex">Delete local search index (including local solr)</label><br/>
#(solr)#::<input type="checkbox" name="deleteSolr" id="deleteSolr" #(solr)#::<input type="checkbox" name="deleteRemoteSolr" id="deleteRemoteSolr"
onclick="x=document.getElementById('deleteSolr').checked;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;" onclick="x=document.getElementById('deleteRemoteSolr').checked;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
/><label for="deleteSolr">Delete Solr Index</label><br/>#(/solr)# /><label for="deleteRemoteSolr">Delete remote solr index</label><br/>#(/solr)#
<input type="checkbox" name="deleteTriplestore" id="deleteTriplestore" disabled="disabled" /><label for="deleteTriplestore">Delete RDF Triplestore</label><br/> <input type="checkbox" name="deleteTriplestore" id="deleteTriplestore" disabled="disabled" /><label for="deleteTriplestore">Delete RDF Triplestore</label><br/>
<input type="checkbox" name="deleteCache" id="deleteCache" disabled="disabled" /><label for="deleteCache">Delete HTTP &amp; FTP Cache</label><br/> <input type="checkbox" name="deleteCache" id="deleteCache" disabled="disabled" /><label for="deleteCache">Delete HTTP &amp; FTP Cache</label><br/>
<input type="checkbox" name="deleteCrawlQueues" id="deleteCrawlQueues" disabled="disabled" /><label for="deleteCrawlQueues">Stop Crawler and delete Crawl Queues</label><br/> <input type="checkbox" name="deleteCrawlQueues" id="deleteCrawlQueues" disabled="disabled" /><label for="deleteCrawlQueues">Stop Crawler and delete Crawl Queues</label><br/>

@ -42,6 +42,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReference;
@ -156,7 +157,7 @@ public class IndexControlRWIs_p {
if ( post.get("deleteIndex", "").equals("on") ) { if ( post.get("deleteIndex", "").equals("on") ) {
segment.clear(); segment.clear();
} }
if ( post.get("deleteSolr", "").equals("on") && sb.index.getRemoteSolr() != null) { if ( post.get("deleteRemoteSolr", "").equals("on") && sb.index.getRemoteSolr() != null) {
try { try {
sb.index.getRemoteSolr().clear(); sb.index.getRemoteSolr().clear();
} catch ( final Exception e ) { } catch ( final Exception e ) {
@ -307,15 +308,15 @@ public class IndexControlRWIs_p {
index = segment.termIndex().get(keyhash, null); index = segment.termIndex().get(keyhash, null);
// built urlCache // built urlCache
final Iterator<WordReference> urlIter = index.entries(); final Iterator<WordReference> urlIter = index.entries();
final TreeMap<byte[], URIMetadataRow> knownURLs = final TreeMap<byte[], URIMetadata> knownURLs =
new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder); new TreeMap<byte[], URIMetadata>(Base64Order.enhancedCoder);
final HandleSet unknownURLEntries = final HandleSet unknownURLEntries =
new HandleSet( new HandleSet(
WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.primaryKeyLength,
WordReferenceRow.urlEntryRow.objectOrder, WordReferenceRow.urlEntryRow.objectOrder,
index.size()); index.size());
Reference iEntry; Reference iEntry;
URIMetadataRow lurl; URIMetadata lurl;
while (urlIter.hasNext()) { while (urlIter.hasNext()) {
iEntry = urlIter.next(); iEntry = urlIter.next();
lurl = segment.urlMetadata().load(iEntry.urlhash()); lurl = segment.urlMetadata().load(iEntry.urlhash());
@ -413,7 +414,7 @@ public class IndexControlRWIs_p {
} catch ( final RowSpaceExceededException e ) { } catch ( final RowSpaceExceededException e ) {
Log.logException(e); Log.logException(e);
} }
final URIMetadataRow e = segment.urlMetadata().load(b); final URIMetadata e = segment.urlMetadata().load(b);
segment.urlMetadata().remove(b); segment.urlMetadata().remove(b);
if ( e != null ) { if ( e != null ) {
url = e.url(); url = e.url();
@ -448,7 +449,7 @@ public class IndexControlRWIs_p {
} catch ( final RowSpaceExceededException e ) { } catch ( final RowSpaceExceededException e ) {
Log.logException(e); Log.logException(e);
} }
final URIMetadataRow e = segment.urlMetadata().load(b); final URIMetadata e = segment.urlMetadata().load(b);
segment.urlMetadata().remove(b); segment.urlMetadata().remove(b);
if ( e != null ) { if ( e != null ) {
url = e.url(); url = e.url();
@ -514,7 +515,7 @@ public class IndexControlRWIs_p {
prop.put("genUrlList_lines", maxlines); prop.put("genUrlList_lines", maxlines);
int i = 0; int i = 0;
DigestURI url; DigestURI url;
URIMetadataRow entry; URIMetadata entry;
String us; String us;
long rn = -1; long rn = -1;
while ( !ranked.isEmpty() && (entry = ranked.takeURL(false, 1000)) != null ) { while ( !ranked.isEmpty() && (entry = ranked.takeURL(false, 1000)) != null ) {

@ -35,7 +35,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Base64Order;
@ -132,7 +132,7 @@ public class IndexControlURLs_p {
} }
if (post.containsKey("urlhashdelete")) { if (post.containsKey("urlhashdelete")) {
final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash)); final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
if (entry == null) { if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else { } else {
@ -166,7 +166,7 @@ public class IndexControlURLs_p {
final DigestURI url = new DigestURI(urlstring); final DigestURI url = new DigestURI(urlstring);
urlhash = ASCII.String(url.hash()); urlhash = ASCII.String(url.hash());
prop.put("urlhash", urlhash); prop.put("urlhash", urlhash);
final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash)); final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
if (entry == null) { if (entry == null) {
prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true)); prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true));
prop.putHTML("urlstring", urlstring); prop.putHTML("urlstring", urlstring);
@ -184,7 +184,7 @@ public class IndexControlURLs_p {
} }
if (post.containsKey("urlhashsearch")) { if (post.containsKey("urlhashsearch")) {
final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash)); final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
if (entry == null) { if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash); prop.putHTML("result", "No Entry for URL hash " + urlhash);
} else { } else {
@ -199,9 +199,9 @@ public class IndexControlURLs_p {
// generate list // generate list
if (post.containsKey("urlhashsimilar")) { if (post.containsKey("urlhashsimilar")) {
try { try {
final Iterator<URIMetadataRow> entryIt = new RotateIterator<URIMetadataRow>(segment.urlMetadata().entries(true, urlhash), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax()); final Iterator<URIMetadata> entryIt = new RotateIterator<URIMetadata>(segment.urlMetadata().entries(true, urlhash), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />"); final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
URIMetadataRow entry; URIMetadata entry;
int i = 0, rows = 0, cols = 0; int i = 0, rows = 0, cols = 0;
prop.put("urlhashsimilar", "1"); prop.put("urlhashsimilar", "1");
while (entryIt.hasNext() && i < 256) { while (entryIt.hasNext() && i < 256) {
@ -303,14 +303,14 @@ public class IndexControlURLs_p {
return prop; return prop;
} }
private static serverObjects genUrlProfile(final Segment segment, final URIMetadataRow entry, final String urlhash) { private static serverObjects genUrlProfile(final Segment segment, final URIMetadata entry, final String urlhash) {
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
if (entry == null) { if (entry == null) {
prop.put("genUrlProfile", "1"); prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash); prop.put("genUrlProfile_urlhash", urlhash);
return prop; return prop;
} }
final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash()); final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
if (entry.url() == null) { if (entry.url() == null) {
prop.put("genUrlProfile", "1"); prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash); prop.put("genUrlProfile_urlhash", urlhash);

@ -51,7 +51,7 @@ import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment; import net.yacy.search.index.Segment;
@ -112,7 +112,7 @@ public class ViewFile {
// get the url hash from which the content should be loaded // get the url hash from which the content should be loaded
String urlHash = post.get("urlHash", ""); String urlHash = post.get("urlHash", "");
URIMetadataRow urlEntry = null; URIMetadata urlEntry = null;
// get the urlEntry that belongs to the url hash // get the urlEntry that belongs to the url hash
if (urlHash.length() > 0 && (urlEntry = indexSegment.urlMetadata().load(ASCII.getBytes(urlHash))) != null) { if (urlHash.length() > 0 && (urlEntry = indexSegment.urlMetadata().load(ASCII.getBytes(urlHash))) != null) {
// get the url that belongs to the entry // get the url that belongs to the entry

@ -35,7 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment; import net.yacy.search.index.Segment;
@ -86,12 +86,12 @@ public class Vocabulary_p {
if (p >= 0) t = t.substring(p + 1); if (p >= 0) t = t.substring(p + 1);
} }
if (discoverFromTitle || discoverFromTitleSplitted) { if (discoverFromTitle || discoverFromTitleSplitted) {
URIMetadataRow m = segment.urlMetadata().load(u.hash()); URIMetadata m = segment.urlMetadata().load(u.hash());
if (m != null) t = m.dc_title(); if (m != null) t = m.dc_title();
if (t.endsWith(".jpg") || t.endsWith(".gif")) continue; if (t.endsWith(".jpg") || t.endsWith(".gif")) continue;
} }
if (discoverFromAuthor) { if (discoverFromAuthor) {
URIMetadataRow m = segment.urlMetadata().load(u.hash()); URIMetadata m = segment.urlMetadata().load(u.hash());
if (m != null) t = m.dc_creator(); if (m != null) t = m.dc_creator();
} }
t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim();

@ -35,7 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.RequestHeader.FileType; import net.yacy.cora.protocol.RequestHeader.FileType;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
@ -97,13 +97,13 @@ public class yacydoc {
} }
if (urlhash == null || urlhash.isEmpty()) return prop; if (urlhash == null || urlhash.isEmpty()) return prop;
final URIMetadataRow entry = segment.urlMetadata().load(urlhash.getBytes()); final URIMetadata entry = segment.urlMetadata().load(urlhash.getBytes());
if (entry == null) return prop; if (entry == null) return prop;
if (entry.url() == null) { if (entry.url() == null) {
return prop; return prop;
} }
final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash()); final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
prop.putXML("dc_title", entry.dc_title()); prop.putXML("dc_title", entry.dc_title());
prop.putXML("dc_creator", entry.dc_creator()); prop.putXML("dc_creator", entry.dc_creator());

@ -31,7 +31,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.peers.Protocol; import net.yacy.peers.Protocol;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import de.anomic.crawler.NoticedURL; import de.anomic.crawler.NoticedURL;
@ -110,7 +110,7 @@ public class urls {
if (urlhashes.length() % 12 != 0) return prop; if (urlhashes.length() % 12 != 0) return prop;
final int count = urlhashes.length() / 12; final int count = urlhashes.length() / 12;
int c = 0; int c = 0;
URIMetadataRow entry; URIMetadata entry;
DigestURI referrer; DigestURI referrer;
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
entry = sb.index.urlMetadata().load(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1)))); entry = sb.index.urlMetadata().load(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));

@ -57,7 +57,7 @@ import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.geolocation.GeoLocation; import net.yacy.document.geolocation.GeoLocation;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
@ -660,7 +660,7 @@ public class yacysearch {
return prop; return prop;
} }
final String recommendHash = post.get("recommendref", ""); // urlhash final String recommendHash = post.get("recommendref", ""); // urlhash
final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash)); final URIMetadata urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash));
if ( urlentry != null ) { if ( urlentry != null ) {
Document[] documents = null; Document[] documents = null;
try { try {
@ -696,7 +696,7 @@ public class yacysearch {
return prop; return prop;
} }
final String bookmarkHash = post.get("bookmarkref", ""); // urlhash final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(bookmarkHash)); final URIMetadata urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(bookmarkHash));
if ( urlentry != null ) { if ( urlentry != null ) {
try { try {
sb.tables.bookmarks.createBookmark( sb.tables.bookmarks.createBookmark(

@ -45,7 +45,7 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.kelondro.workflow.WorkflowProcessor;
@ -439,7 +439,7 @@ public final class CrawlStacker {
// check if the url is double registered // check if the url is double registered
final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
final URIMetadataRow oldEntry = this.indexSegment.urlMetadata().load(url.hash()); final URIMetadata oldEntry = this.indexSegment.urlMetadata().load(url.hash());
if (oldEntry == null) { if (oldEntry == null) {
if (dbocc != null) { if (dbocc != null) {
// do double-check // do double-check

@ -32,7 +32,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.document.parser.sitemapParser; import net.yacy.document.parser.sitemapParser;
import net.yacy.document.parser.sitemapParser.URLEntry; import net.yacy.document.parser.sitemapParser.URLEntry;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Request;
@ -84,7 +84,7 @@ public class SitemapImporter extends Thread {
final String dbocc = this.sb.urlExists(nexturlhash); final String dbocc = this.sb.urlExists(nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) { if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
// the url was already loaded. we need to check the date // the url was already loaded. we need to check the date
final URIMetadataRow oldEntry = this.sb.index.urlMetadata().load(nexturlhash); final URIMetadata oldEntry = this.sb.index.urlMetadata().load(nexturlhash);
if (oldEntry != null) { if (oldEntry != null) {
final Date modDate = oldEntry.moddate(); final Date modDate = oldEntry.moddate();
// check if modDate is null // check if modDate is null

@ -419,7 +419,8 @@ public class URLAnalysis {
public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException { public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException {
System.out.println("INDEX DIFF URL-COL startup"); System.out.println("INDEX DIFF URL-COL startup");
final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile)); final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile));
final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false); final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
mr.connectUrlDb(Segment.UrlDbName, false, false);
final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000); final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000);
System.out.println("INDEX DIFF URL-COL loaded dump, starting diff"); System.out.println("INDEX DIFF URL-COL loaded dump, starting diff");
final long start = System.currentTimeMillis(); final long start = System.currentTimeMillis();
@ -447,7 +448,8 @@ public class URLAnalysis {
public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException { public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException {
// format: 0=text, 1=html, 2=rss/xml // format: 0=text, 1=html, 2=rss/xml
System.out.println("URL EXPORT startup"); System.out.println("URL EXPORT startup");
final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false); final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
mr.connectUrlDb(Segment.UrlDbName, false, false);
final HandleSet hs = (diffFile == null) ? null : new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile)); final HandleSet hs = (diffFile == null) ? null : new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
System.out.println("URL EXPORT loaded dump, starting export"); System.out.println("URL EXPORT loaded dump, starting export");
final Export e = mr.export(new File(export), ".*", hs, format, false); final Export e = mr.export(new File(export), ".*", hs, format, false);
@ -461,7 +463,8 @@ public class URLAnalysis {
public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException { public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException {
System.out.println("URL DELETE startup"); System.out.println("URL DELETE startup");
final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false); final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
mr.connectUrlDb(Segment.UrlDbName, false, false);
final int mrSize = mr.size(); final int mrSize = mr.size();
final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile)); final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize); System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);

@ -36,7 +36,7 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser.Failure; import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.repository.LoaderDispatcher; import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Segment; import net.yacy.search.index.Segment;
import de.anomic.crawler.retrieval.Response; import de.anomic.crawler.retrieval.Response;
@ -105,7 +105,7 @@ public class YMarkMetadata {
public EnumMap<METADATA, String> getMetadata() { public EnumMap<METADATA, String> getMetadata() {
final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class); final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
final URIMetadataRow urlEntry = this.indexSegment.urlMetadata().load(this.uri.hash()); final URIMetadata urlEntry = this.indexSegment.urlMetadata().load(this.uri.hash());
if (urlEntry != null) { if (urlEntry != null) {
metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size())); metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size()));
metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate())); metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate()));

@ -34,6 +34,7 @@ import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest; import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
@ -89,7 +90,7 @@ public class AbstractSolrConnector implements SolrConnector {
@Override @Override
public long getSize() { public long getSize() {
try { try {
final SolrDocumentList list = get("*:*", 0, 1); final SolrDocumentList list = query("*:*", 0, 1);
return list.getNumFound(); return list.getNumFound();
} catch (final Throwable e) { } catch (final Throwable e) {
Log.logException(e); Log.logException(e);
@ -132,8 +133,8 @@ public class AbstractSolrConnector implements SolrConnector {
@Override @Override
public boolean exists(final String id) throws IOException { public boolean exists(final String id) throws IOException {
try { try {
final SolrDocumentList list = get(SolrField.id.getSolrFieldName() + ":" + id, 0, 1); final SolrDocument doc = get(id);
return list.getNumFound() > 0; return doc != null;
} catch (final Throwable e) { } catch (final Throwable e) {
Log.logException(e); Log.logException(e);
return false; return false;
@ -186,7 +187,7 @@ public class AbstractSolrConnector implements SolrConnector {
* @throws IOException * @throws IOException
*/ */
@Override @Override
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException { public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException {
// construct query // construct query
final SolrQuery query = new SolrQuery(); final SolrQuery query = new SolrQuery();
query.setQuery(querystring); query.setQuery(querystring);
@ -209,8 +210,33 @@ public class AbstractSolrConnector implements SolrConnector {
} catch (final Throwable e) { } catch (final Throwable e) {
throw new IOException(e); throw new IOException(e);
} }
}
/**
* get a document from solr by given id
* @param id
* @return one result or null if no result exists
* @throws IOException
*/
@Override
public SolrDocument get(final String id) throws IOException {
// construct query
StringBuffer sb = new StringBuffer(id.length() + 3);
sb.append(SolrField.id.getSolrFieldName()).append(':').append(id);
final SolrQuery query = new SolrQuery();
query.setQuery(sb.toString());
query.setRows(1);
query.setStart(0);
//return result; // query the server
try {
final QueryResponse rsp = this.server.query( query );
final SolrDocumentList docs = rsp.getResults();
if (docs.isEmpty()) return null;
return docs.get(0);
} catch (final Throwable e) {
throw new IOException(e);
}
} }
} }

@ -5,6 +5,7 @@ import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ArrayBlockingQueue;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
@ -111,6 +112,11 @@ public class MultipleSolrConnector implements SolrConnector {
return this.solr.exists(id); return this.solr.exists(id);
} }
/**
 * Get a document by id; the lookup is handed over unchanged to the wrapped connector.
 * @param id the id of the wanted document
 * @return whatever the wrapped connector returns for this id
 * @throws IOException if the wrapped connector fails
 */
@Override
public SolrDocument get(String id) throws IOException {
    final SolrDocument doc = this.solr.get(id);
    return doc;
}
@Override @Override
public void add(final SolrDoc solrdoc) throws IOException, SolrException { public void add(final SolrDoc solrdoc) throws IOException, SolrException {
try { try {
@ -132,8 +138,8 @@ public class MultipleSolrConnector implements SolrConnector {
} }
@Override @Override
public SolrDocumentList get(String querystring, int offset, int count) throws IOException { public SolrDocumentList query(String querystring, int offset, int count) throws IOException {
return this.solr.get(querystring, offset, count); return this.solr.query(querystring, offset, count);
} }
@Override @Override

@ -28,6 +28,7 @@ import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
@ -120,6 +121,21 @@ public class RetrySolrConnector implements SolrConnector {
return false; return false;
} }
/**
 * Get a document by id from the wrapped connector, retrying after a failure
 * until retryMaxTime milliseconds have passed.
 * At least one attempt is always made, so a null result always means that the
 * wrapped connector answered with null and never that no lookup happened.
 * If the thread is interrupted while waiting for the next attempt, the
 * interrupt flag is restored and retrying stops immediately.
 * @param id the id of the wanted document
 * @return the result of the first attempt that did not fail (may be null)
 * @throws IOException the failure of the last attempt; an IOException is rethrown
 *         as it is, any other failure is wrapped with the original as cause
 */
@Override
public SolrDocument get(String id) throws IOException {
    final long deadline = System.currentTimeMillis() + this.retryMaxTime;
    Throwable lastFailure;
    do {
        try {
            return this.solrConnector.get(id);
        } catch (final Throwable e) {
            lastFailure = e;
        }
        try {
            Thread.sleep(10); // short pause before the next attempt
        } catch (final InterruptedException interrupted) {
            // do not swallow the interrupt: restore the flag and give up retrying
            Thread.currentThread().interrupt();
            break;
        }
    } while (System.currentTimeMillis() < deadline);
    if (lastFailure instanceof IOException) throw (IOException) lastFailure;
    throw new IOException(lastFailure.getMessage(), lastFailure);
}
@Override @Override
public void add(final SolrDoc solrdoc) throws IOException, SolrException { public void add(final SolrDoc solrdoc) throws IOException, SolrException {
final long t = System.currentTimeMillis() + this.retryMaxTime; final long t = System.currentTimeMillis() + this.retryMaxTime;
@ -141,11 +157,11 @@ public class RetrySolrConnector implements SolrConnector {
} }
@Override @Override
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException { public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException {
final long t = System.currentTimeMillis() + this.retryMaxTime; final long t = System.currentTimeMillis() + this.retryMaxTime;
Throwable ee = null; Throwable ee = null;
while (System.currentTimeMillis() < t) try { while (System.currentTimeMillis() < t) try {
return this.solrConnector.get(querystring, offset, count); return this.solrConnector.query(querystring, offset, count);
} catch (final Throwable e) { } catch (final Throwable e) {
ee = e; ee = e;
try {Thread.sleep(10);} catch (final InterruptedException e1) {} try {Thread.sleep(10);} catch (final InterruptedException e1) {}

@ -116,6 +116,15 @@ public class ShardSolrConnector implements SolrConnector {
} }
return false; return false;
} }
/**
 * Get a document by id by asking the connectors one after another.
 * The search stops at the first connector that returns a document.
 * @param id the id of the wanted document
 * @return the first document found, or null if no connector returned one
 * @throws IOException if one of the asked connectors fails; remaining connectors are then not asked
 */
@Override
public SolrDocument get(String id) throws IOException {
    SolrDocument hit = null;
    for (final SolrConnector shard : this.connectors) {
        hit = shard.get(id);
        if (hit != null) break;
    }
    return hit;
}
/** /**
* add a Solr document * add a Solr document
@ -148,10 +157,10 @@ public class ShardSolrConnector implements SolrConnector {
* @throws IOException * @throws IOException
*/ */
@Override @Override
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException { public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException {
final SolrDocumentList list = new SolrDocumentList(); final SolrDocumentList list = new SolrDocumentList();
for (final SolrConnector connector: this.connectors) { for (final SolrConnector connector: this.connectors) {
final SolrDocumentList l = connector.get(querystring, offset, count); final SolrDocumentList l = connector.query(querystring, offset, count);
for (final SolrDocument d: l) { for (final SolrDocument d: l) {
list.add(d); list.add(d);
} }
@ -163,7 +172,7 @@ public class ShardSolrConnector implements SolrConnector {
final SolrDocumentList[] list = new SolrDocumentList[this.connectors.size()]; final SolrDocumentList[] list = new SolrDocumentList[this.connectors.size()];
int i = 0; int i = 0;
for (final SolrConnector connector: this.connectors) { for (final SolrConnector connector: this.connectors) {
list[i++] = connector.get(querystring, offset, count); list[i++] = connector.query(querystring, offset, count);
} }
return list; return list;
} }

@ -28,6 +28,7 @@ import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
@ -87,13 +88,21 @@ public interface SolrConnector {
public void add(final SolrDoc solrdoc) throws IOException, SolrException; public void add(final SolrDoc solrdoc) throws IOException, SolrException;
public void add(final Collection<SolrDoc> solrdocs) throws IOException, SolrException; public void add(final Collection<SolrDoc> solrdocs) throws IOException, SolrException;
/**
 * Get a single document from solr by its id.
 * Use {@link #query(String, int, int)} to search with an arbitrary query string instead.
 * @param id the id of the wanted document
 * @return one result or null if no result exists
 * @throws IOException if the lookup could not be performed
 */
public SolrDocument get(final String id) throws IOException;
/** /**
* get a query result from solr * get a query result from solr
* to get all results set the query String to "*:*" * to get all results set the query String to "*:*"
* @param querystring * @param querystring
* @throws IOException * @throws IOException
*/ */
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException; public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException;
/** /**
* get the size of the index * get the size of the index

@ -24,8 +24,8 @@ package net.yacy.kelondro.data.meta;
import java.util.Date; import java.util.Date;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.rwi.Reference;
public interface URIMetadata extends URIReference { public interface URIMetadata extends URIReference {
@ -74,10 +74,12 @@ public interface URIMetadata extends URIReference {
public String snippet(); public String snippet();
public Reference word(); public WordReference word();
public boolean isOlder(final URIMetadata other); public boolean isOlder(final URIMetadata other);
public String toString(final String snippet); public String toString(final String snippet);
public byte[] referrerHash();
} }

@ -35,6 +35,12 @@ public interface URIReference {
*/ */
public byte[] hash(); public byte[] hash();
/**
 * The second half of a uri hash is the host hash.
 * @return the host hash part of the hash returned by {@link #hash()}, as a String
 */
public String hosthash();
/** /**
* The modification date of the URIReference is given if * The modification date of the URIReference is given if
* the record was created first and is defined with the * the record was created first and is defined with the

@ -49,6 +49,14 @@ public class URIReferenceNode extends HashMap<String, byte[]> implements URIRefe
return this.hash; return this.hash;
} }
// cached result of hosthash(); stays null until the first call
private String hostHash = null;

/**
 * The host hash is computed on the first call from the 6 bytes that start
 * at offset 6 of the url hash; later calls return the cached value.
 * @return the host hash as a String
 */
@Override
public String hosthash() {
    if (this.hostHash == null) {
        this.hostHash = ASCII.String(this.hash, 6, 6);
    }
    return this.hostHash;
}
@Override @Override
public Date moddate() { public Date moddate() {
byte[] x = this.get(MetadataVocabulary.moddate.name()); byte[] x = this.get(MetadataVocabulary.moddate.name());

@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.RSSMessage;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MapTools; import net.yacy.kelondro.util.MapTools;
import net.yacy.peers.operation.yacyVersion; import net.yacy.peers.operation.yacyVersion;

@ -77,6 +77,7 @@ import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.services.federated.opensearch.SRURSSConnector; import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReference;
@ -1155,7 +1156,7 @@ public final class Protocol
public static String transferIndex( public static String transferIndex(
final Seed targetSeed, final Seed targetSeed,
final ReferenceContainerCache<WordReference> indexes, final ReferenceContainerCache<WordReference> indexes,
final SortedMap<byte[], URIMetadataRow> urlCache, final SortedMap<byte[], URIMetadata> urlCache,
final boolean gzipBody, final boolean gzipBody,
final int timeout) { final int timeout) {
@ -1216,7 +1217,7 @@ public final class Protocol
} // all url's known } // all url's known
// extract the urlCache from the result // extract the urlCache from the result
final URIMetadataRow[] urls = new URIMetadataRow[uhs.length]; final URIMetadata[] urls = new URIMetadataRow[uhs.length];
for ( int i = 0; i < uhs.length; i++ ) { for ( int i = 0; i < uhs.length; i++ ) {
urls[i] = urlCache.get(ASCII.getBytes(uhs[i])); urls[i] = urlCache.get(ASCII.getBytes(uhs[i]));
if ( urls[i] == null ) { if ( urls[i] == null ) {
@ -1324,7 +1325,7 @@ public final class Protocol
private static Map<String, String> transferURL( private static Map<String, String> transferURL(
final Seed targetSeed, final Seed targetSeed,
final URIMetadataRow[] urls, final URIMetadata[] urls,
boolean gzipBody, boolean gzipBody,
final int timeout) { final int timeout) {
// this post a message to the remote message board // this post a message to the remote message board
@ -1346,7 +1347,7 @@ public final class Protocol
String resource; String resource;
int urlc = 0; int urlc = 0;
int urlPayloadSize = 0; int urlPayloadSize = 0;
for ( final URIMetadataRow url : urls ) { for ( final URIMetadata url : urls ) {
if ( url != null ) { if ( url != null ) {
resource = url.toString(); resource = url.toString();
//System.out.println("*** DEBUG resource = " + resource); //System.out.println("*** DEBUG resource = " + resource);

@ -32,7 +32,7 @@ import java.util.SortedMap;
import java.util.TreeMap; import java.util.TreeMap;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.data.word.WordReferenceRow;
@ -90,7 +90,7 @@ public class Transmission {
*/ */
private final byte[] primaryTarget; private final byte[] primaryTarget;
private final ReferenceContainerCache<WordReference> containers; private final ReferenceContainerCache<WordReference> containers;
private final SortedMap<byte[], URIMetadataRow> references; private final SortedMap<byte[], URIMetadata> references;
private final HandleSet badReferences; private final HandleSet badReferences;
private final List<Seed> targets; private final List<Seed> targets;
private int hit, miss; private int hit, miss;
@ -106,7 +106,7 @@ public class Transmission {
super(); super();
this.primaryTarget = primaryTarget; this.primaryTarget = primaryTarget;
this.containers = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength); this.containers = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength);
this.references = new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder); this.references = new TreeMap<byte[], URIMetadata>(Base64Order.enhancedCoder);
this.badReferences = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); this.badReferences = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
this.targets = targets; this.targets = targets;
this.hit = 0; this.hit = 0;
@ -175,7 +175,7 @@ public class Transmission {
notFoundx.add(e.urlhash()); notFoundx.add(e.urlhash());
continue; continue;
} }
final URIMetadataRow r = Transmission.this.segment.urlMetadata().load(e.urlhash()); final URIMetadata r = Transmission.this.segment.urlMetadata().load(e.urlhash());
if (r == null) { if (r == null) {
notFoundx.add(e.urlhash()); notFoundx.add(e.urlhash());
this.badReferences.put(e.urlhash()); this.badReferences.put(e.urlhash());

@ -45,6 +45,7 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException; import java.util.regex.PatternSyntaxException;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.index.RowSpaceExceededException;
@ -332,7 +333,7 @@ public class Blacklist {
* @param entry Entry to be checked * @param entry Entry to be checked
* @return Whether the given entry is blacklisted * @return Whether the given entry is blacklisted
*/ */
public boolean isListed(final BlacklistType blacklistType, final URIMetadataRow entry) { public boolean isListed(final BlacklistType blacklistType, final URIMetadata entry) {
// Call inner method // Call inner method
return isListed(blacklistType, entry.url()); return isListed(blacklistType, entry.url());
} }

@ -111,6 +111,7 @@ import net.yacy.document.parser.html.Evaluation;
import net.yacy.gui.Tray; import net.yacy.gui.Tray;
import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.HandleSet;
@ -391,8 +392,12 @@ public final class Switchboard extends serverSwitch
fileSizeMax, fileSizeMax,
this.useTailCache, this.useTailCache,
this.exceed134217727, this.exceed134217727,
solrLocal); solrLocal,
true, // useCitationIndex
true, // useRWI
true // useMetadata
);
// prepare a solr index profile switch list // prepare a solr index profile switch list
final File solrBackupProfile = new File("defaults/solr.keys.list"); final File solrBackupProfile = new File("defaults/solr.keys.list");
final String schemename = final String schemename =
@ -1197,7 +1202,11 @@ public final class Switchboard extends serverSwitch
fileSizeMax, fileSizeMax,
this.useTailCache, this.useTailCache,
this.exceed134217727, this.exceed134217727,
solrLocal); solrLocal,
true, // useCitationIndex
true, // useRWI
true // useMetadata
);
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
// create a crawler // create a crawler
@ -1447,7 +1456,7 @@ public final class Switchboard extends serverSwitch
if ( urlhash.length == 0 ) { if ( urlhash.length == 0 ) {
return null; return null;
} }
final URIMetadataRow le = this.index.urlMetadata().load(urlhash); final URIMetadata le = this.index.urlMetadata().load(urlhash);
if ( le != null ) { if ( le != null ) {
return le.url(); return le.url();
} }

@ -41,6 +41,7 @@ import net.yacy.document.Document;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.search.query.QueryParams; import net.yacy.search.query.QueryParams;
@ -74,7 +75,19 @@ public class DocumentIndex extends Segment
public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize) public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize)
throws IOException { throws IOException {
super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false, true); super(
new Log("DocumentIndex"),
segmentPath,
cachesize,
targetFileSize * 4 - 1,
false, // useTailCache
false, // exceed134217727
true, // connectLocalSolr
true, // useCitationIndex
true, // useRWI
true // useMetadata
);
final int cores = Runtime.getRuntime().availableProcessors() + 1; final int cores = Runtime.getRuntime().availableProcessors() + 1;
this.callback = callback; this.callback = callback;
this.queue = new LinkedBlockingQueue<DigestURI>(cores * 300); this.queue = new LinkedBlockingQueue<DigestURI>(cores * 300);
@ -227,7 +240,7 @@ public class DocumentIndex extends Segment
rankedCache.start(); rankedCache.start();
// search is running; retrieve results // search is running; retrieve results
URIMetadataRow row; URIMetadata row;
final ArrayList<DigestURI> files = new ArrayList<DigestURI>(); final ArrayList<DigestURI> files = new ArrayList<DigestURI>();
while ( (row = rankedCache.takeURL(false, 1000)) != null ) { while ( (row = rankedCache.takeURL(false, 1000)) != null ) {
files.add(row.url()); files.add(row.url());

@ -49,6 +49,7 @@ import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.Cache; import net.yacy.kelondro.index.Cache;
@ -65,43 +66,38 @@ import net.yacy.search.Switchboard;
import net.yacy.search.solr.EmbeddedSolrConnector; import net.yacy.search.solr.EmbeddedSolrConnector;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import de.anomic.crawler.CrawlStacker; import de.anomic.crawler.CrawlStacker;
public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]> { public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]> {
// class objects // class objects
protected Index urlIndexFile; private final File location;
private Index urlIndexFile;
private Export exportthread; // will have a export thread assigned if exporter is running private Export exportthread; // will have a export thread assigned if exporter is running
private final File location; private String tablename;
private final String tablename;
private ArrayList<HostStat> statsDump; private ArrayList<HostStat> statsDump;
private SolrConnector localSolr, remoteSolr; private SolrConnector localSolr, remoteSolr;
public MetadataRepository( public MetadataRepository(final File path) {
final File path,
final String tablename,
final boolean useTailCache,
final boolean exceed134217727) {
this.location = path; this.location = path;
this.tablename = tablename; this.tablename = null;
Index backupIndex = null; this.urlIndexFile = null;
backupIndex = new SplitTable(this.location, tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727);
this.urlIndexFile = backupIndex; //new Cache(backupIndex, 20000000, 20000000);
this.exportthread = null; // will have a export thread assigned if exporter is running this.exportthread = null; // will have a export thread assigned if exporter is running
this.statsDump = null; this.statsDump = null;
this.remoteSolr = null; this.remoteSolr = null;
this.localSolr = null; this.localSolr = null;
} }
public void connectRemoteSolr(final SolrConnector solr) { public void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) {
this.remoteSolr = solr; if (this.urlIndexFile != null) return;
this.tablename = tablename;
this.urlIndexFile = new SplitTable(this.location, tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727);
} }
public void disconnectRemoteSolr() { public void disconnectUrlDb() {
if (this.remoteSolr == null) return; if (this.urlIndexFile == null) return;
this.remoteSolr.close(); this.urlIndexFile.close();
this.remoteSolr = null; this.urlIndexFile = null;
} }
public void connectLocalSolr() throws IOException { public void connectLocalSolr() throws IOException {
@ -123,6 +119,16 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.localSolr.close(); this.localSolr.close();
this.localSolr = null; this.localSolr = null;
} }
/**
 * Attach a remote solr connector to this repository.
 * A connector that was attached before is replaced without being closed;
 * call disconnectRemoteSolr() first if the old connector must be closed.
 * @param solr the connector to use as remote solr
 */
public void connectRemoteSolr(final SolrConnector solr) {
this.remoteSolr = solr;
}
/**
 * Close and detach the remote solr connector.
 * Does nothing if no remote solr connector is attached.
 */
public void disconnectRemoteSolr() {
    if (this.remoteSolr != null) {
        this.remoteSolr.close();
        this.remoteSolr = null;
    }
}
public SolrConnector getLocalSolr() { public SolrConnector getLocalSolr() {
return this.localSolr; return this.localSolr;
@ -133,7 +139,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
} }
public void clearCache() { public void clearCache() {
if (this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache(); if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
if (this.statsDump != null) this.statsDump.clear(); if (this.statsDump != null) this.statsDump.clear();
this.statsDump = null; this.statsDump = null;
} }
@ -142,15 +148,22 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
if (this.exportthread != null) this.exportthread.interrupt(); if (this.exportthread != null) this.exportthread.interrupt();
if (this.urlIndexFile == null) { if (this.urlIndexFile == null) {
SplitTable.delete(this.location, this.tablename); SplitTable.delete(this.location, this.tablename);
this.urlIndexFile = new SplitTable(this.location, this.tablename, URIMetadataRow.rowdef, false, false);
} else { } else {
this.urlIndexFile.clear(); this.urlIndexFile.clear();
} }
if (this.localSolr != null) {
this.localSolr.clear();
}
// the remote solr is not cleared here because that shall be done separately
this.statsDump = null; this.statsDump = null;
} }
public int size() { public int size() {
return this.urlIndexFile == null ? 0 : this.urlIndexFile.size(); int size = 0;
size += this.urlIndexFile == null ? 0 : this.urlIndexFile.size();
size += this.localSolr == null ? 0 : this.localSolr.getSize();
size += this.remoteSolr == null ? 0 : this.remoteSolr.getSize();
return size;
} }
public void close() { public void close() {
@ -170,8 +183,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
} }
public int writeCacheSize() { public int writeCacheSize() {
if (this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize(); if (this.urlIndexFile != null && this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize();
if (this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize(); if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize();
return 0; return 0;
} }
@ -181,59 +194,69 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
* @param obrwi * @param obrwi
* @return * @return
*/ */
public URIMetadataRow load(final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi) { public URIMetadata load(final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi) {
if (this.urlIndexFile == null) return null;
if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
final byte[] urlHash = obrwi.getElement().urlhash(); final byte[] urlHash = obrwi.getElement().urlhash();
if (urlHash == null) return null; if (urlHash == null) return null;
try { if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false); final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry == null) return null; if (entry == null) return null;
return new URIMetadataRow(entry, obrwi.getElement(), obrwi.getWeight()); return new URIMetadataRow(entry, obrwi.getElement(), obrwi.getWeight());
} catch (final IOException e) { } catch (final IOException e) {
return null; Log.logException(e);
}
/*
if (this.localSolr != null) {
try {
SolrDocument doc = this.localSolr.get(ASCII.String(urlHash));
} catch (IOException e) {
Log.logException(e);
}
} }
*/
return null;
} }
public URIMetadataRow load(final byte[] urlHash) { public URIMetadata load(final byte[] urlHash) {
if (this.urlIndexFile == null) return null;
if (urlHash == null) return null; if (urlHash == null) return null;
try { if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false); final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry == null) return null; if (entry == null) return null;
return new URIMetadataRow(entry, null, 0); return new URIMetadataRow(entry, null, 0);
} catch (final IOException e) { } catch (final IOException e) {
return null; return null;
} }
return null;
} }
public void store(final URIMetadataRow entry) throws IOException { public void store(final URIMetadata entry) throws IOException {
// Check if there is a more recent Entry already in the DB // Check if there is a more recent Entry already in the DB
URIMetadataRow oldEntry; if (this.urlIndexFile != null && entry instanceof URIMetadataRow) {
if (this.urlIndexFile == null) return; // case may happen during shutdown or startup URIMetadata oldEntry = null;
try { try {
final Row.Entry oe = this.urlIndexFile.get(entry.hash(), false); final Row.Entry oe = this.urlIndexFile.get(entry.hash(), false);
oldEntry = (oe == null) ? null : new URIMetadataRow(oe, null, 0); oldEntry = (oe == null) ? null : new URIMetadataRow(oe, null, 0);
} catch (final Exception e) { } catch (final Exception e) {
Log.logException(e); Log.logException(e);
oldEntry = null; oldEntry = null;
}
if (oldEntry != null && entry.isOlder(oldEntry)) {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
// doesn't make sense, since no return value:
//entry = oldEntry;
return; // this did not need to be stored, but is updated
}
try {
this.urlIndexFile.put(((URIMetadataRow) entry).toRowEntry());
} catch (final RowSpaceExceededException e) {
throw new IOException("RowSpaceExceededException in " + this.urlIndexFile.filename() + ": " + e.getMessage());
}
this.statsDump = null;
if (MemoryControl.shortStatus()) clearCache();
} }
if (oldEntry != null && entry.isOlder(oldEntry)) {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
// doesn't make sense, since no return value:
//entry = oldEntry;
return; // this did not need to be stored, but is updated
}
try {
this.urlIndexFile.put(entry.toRowEntry());
} catch (final RowSpaceExceededException e) {
throw new IOException("RowSpaceExceededException in " + this.urlIndexFile.filename() + ": " + e.getMessage());
}
this.statsDump = null;
if (MemoryControl.shortStatus()) clearCache() ;
} }
public boolean remove(final byte[] urlHash) { public boolean remove(final byte[] urlHash) {
@ -251,13 +274,14 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
Log.logException(e); Log.logException(e);
} }
} }
try { if (this.urlIndexFile != null) try {
final Row.Entry r = this.urlIndexFile.remove(urlHash); final Row.Entry r = this.urlIndexFile.remove(urlHash);
if (r != null) this.statsDump = null; if (r != null) this.statsDump = null;
return r != null; return r != null;
} catch (final IOException e) { } catch (final IOException e) {
return false; return false;
} }
return false;
} }
public boolean exists(final byte[] urlHash) { public boolean exists(final byte[] urlHash) {
@ -297,17 +321,17 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
return keys(true, null); return keys(true, null);
} }
public CloneableIterator<URIMetadataRow> entries() throws IOException { public CloneableIterator<URIMetadata> entries() throws IOException {
// enumerates entry elements // enumerates entry elements
return new kiter(); return new kiter();
} }
public CloneableIterator<URIMetadataRow> entries(final boolean up, final String firstHash) throws IOException { public CloneableIterator<URIMetadata> entries(final boolean up, final String firstHash) throws IOException {
// enumerates entry elements // enumerates entry elements
return new kiter(up, firstHash); return new kiter(up, firstHash);
} }
public class kiter implements CloneableIterator<URIMetadataRow> { public class kiter implements CloneableIterator<URIMetadata> {
// enumerates entry elements // enumerates entry elements
private final CloneableIterator<Row.Entry> iter; private final CloneableIterator<Row.Entry> iter;
private final boolean error; private final boolean error;
@ -342,7 +366,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
} }
@Override @Override
public final URIMetadataRow next() { public final URIMetadata next() {
Row.Entry e = null; Row.Entry e = null;
if (this.iter == null) { return null; } if (this.iter == null) { return null; }
if (this.iter.hasNext()) { e = this.iter.next(); } if (this.iter.hasNext()) { e = this.iter.next(); }
@ -372,7 +396,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
final Log log = new Log("URLDBCLEANUP"); final Log log = new Log("URLDBCLEANUP");
final HashSet<String> damagedURLS = new HashSet<String>(); final HashSet<String> damagedURLS = new HashSet<String>();
try { try {
final Iterator<URIMetadataRow> eiter = entries(true, null); final Iterator<URIMetadata> eiter = entries(true, null);
int iteratorCount = 0; int iteratorCount = 0;
while (eiter.hasNext()) try { while (eiter.hasNext()) try {
eiter.next(); eiter.next();
@ -456,7 +480,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
public void run() { public void run() {
try { try {
Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet"); Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet");
final Iterator<URIMetadataRow> eiter = entries(true, null); final Iterator<URIMetadata> eiter = entries(true, null);
while (eiter.hasNext() && this.run) { while (eiter.hasNext() && this.run) {
synchronized (this) { synchronized (this) {
if (this.pause) { if (this.pause) {
@ -469,7 +493,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
} }
} }
} }
final URIMetadataRow entry = eiter.next(); final URIMetadata entry = eiter.next();
if (entry == null) { if (entry == null) {
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null"); if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null");
} else if (entry.hash() == null) { } else if (entry.hash() == null) {
@ -605,8 +629,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.count++; this.count++;
} }
} else { } else {
final Iterator<URIMetadataRow> i = entries(); // iterates indexURLEntry objects final Iterator<URIMetadata> i = entries(); // iterates indexURLEntry objects
URIMetadataRow entry; URIMetadata entry;
String url; String url;
while (i.hasNext()) { while (i.hasNext()) {
entry = i.next(); entry = i.next();
@ -704,7 +728,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
// collect hashes from all domains // collect hashes from all domains
// fetch urls from the database to determine the host in clear text // fetch urls from the database to determine the host in clear text
URIMetadataRow urlref; URIMetadata urlref;
if (count < 0 || count > domainSamples.size()) count = domainSamples.size(); if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
this.statsDump = new ArrayList<HostStat>(); this.statsDump = new ArrayList<HostStat>();
final TreeSet<String> set = new TreeSet<String>(); final TreeSet<String> set = new TreeSet<String>();
@ -741,7 +765,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
*/ */
public Map<String, HostStat> domainHashResolver(final Map<String, URLHashCounter> domainSamples) { public Map<String, HostStat> domainHashResolver(final Map<String, URLHashCounter> domainSamples) {
final HashMap<String, HostStat> hostMap = new HashMap<String, HostStat>(); final HashMap<String, HostStat> hostMap = new HashMap<String, HostStat>();
URIMetadataRow urlref; URIMetadata urlref;
final ScoreMap<String> hosthashScore = new ConcurrentScoreMap<String>(); final ScoreMap<String> hosthashScore = new ConcurrentScoreMap<String>();
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) { for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
@ -762,7 +786,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
// fetch urls from the database to determine the host in clear text // fetch urls from the database to determine the host in clear text
final Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first) final Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first)
URIMetadataRow urlref; URIMetadata urlref;
String urlhash; String urlhash;
count += 10; // make some more to prevent that we have to do this again after deletions too soon. count += 10; // make some more to prevent that we have to do this again after deletions too soon.
if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size(); if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size();

@ -47,6 +47,7 @@ import net.yacy.document.Parser;
import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.citation.CitationReferenceFactory; import net.yacy.kelondro.data.citation.CitationReferenceFactory;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReference;
@ -88,7 +89,8 @@ public class Segment {
public static final int lowcachedivisor = 900; public static final int lowcachedivisor = 900;
public static final long targetFileSize = 64 * 1024 * 1024; // 64 MB public static final long targetFileSize = 64 * 1024 * 1024; // 64 MB
public static final int writeBufferSize = 4 * 1024 * 1024; public static final int writeBufferSize = 4 * 1024 * 1024;
public static final String UrlDbName = "text.urlmd";
// the reference factory // the reference factory
public static final ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory(); public static final ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory();
public static final ReferenceFactory<CitationReference> citationReferenceFactory = new CitationReferenceFactory(); public static final ReferenceFactory<CitationReference> citationReferenceFactory = new CitationReferenceFactory();
@ -109,14 +111,17 @@ public class Segment {
final long maxFileSize, final long maxFileSize,
final boolean useTailCache, final boolean useTailCache,
final boolean exceed134217727, final boolean exceed134217727,
final boolean connectLocalSolr) throws IOException { final boolean connectLocalSolr,
final boolean useCitationIndex,
final boolean useRWI,
final boolean useMetadata) throws IOException {
log.logInfo("Initializing Segment '" + segmentPath + "."); log.logInfo("Initializing Segment '" + segmentPath + ".");
this.log = log; this.log = log;
this.segmentPath = segmentPath; this.segmentPath = segmentPath;
this.termIndex = new IndexCell<WordReference>( this.termIndex = useRWI ? new IndexCell<WordReference>(
segmentPath, segmentPath,
"text.index", "text.index",
wordReferenceFactory, wordReferenceFactory,
@ -125,9 +130,9 @@ public class Segment {
entityCacheMaxSize, entityCacheMaxSize,
targetFileSize, targetFileSize,
maxFileSize, maxFileSize,
writeBufferSize); writeBufferSize) : null;
this.urlCitationIndex = new IndexCell<CitationReference>( this.urlCitationIndex = useCitationIndex ? new IndexCell<CitationReference>(
segmentPath, segmentPath,
"citation.index", "citation.index",
citationReferenceFactory, citationReferenceFactory,
@ -136,10 +141,11 @@ public class Segment {
entityCacheMaxSize, entityCacheMaxSize,
targetFileSize, targetFileSize,
maxFileSize, maxFileSize,
writeBufferSize); writeBufferSize) : null;
// create LURL-db // create LURL-db
this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727); this.urlMetadata = new MetadataRepository(segmentPath);
if (useMetadata) this.urlMetadata.connectUrlDb(UrlDbName, useTailCache, exceed134217727);
if (connectLocalSolr) this.connectLocalSolr(); if (connectLocalSolr) this.connectLocalSolr();
} }
@ -148,10 +154,12 @@ public class Segment {
} }
public long RWICount() { public long RWICount() {
if (this.termIndex == null) return 0;
return this.termIndex.sizesMax(); return this.termIndex.sizesMax();
} }
public int RWIBufferCount() { public int RWIBufferCount() {
if (this.termIndex == null) return 0;
return this.termIndex.getBufferSize(); return this.termIndex.getBufferSize();
} }
@ -235,7 +243,7 @@ public class Segment {
} }
@Override @Override
public DigestURI next() { public DigestURI next() {
URIMetadataRow umr = Segment.this.urlMetadata.load(bi.next()); URIMetadata umr = Segment.this.urlMetadata.load(bi.next());
return umr.url(); return umr.url();
} }
@Override @Override
@ -260,9 +268,9 @@ public class Segment {
public void clear() { public void clear() {
try { try {
this.termIndex.clear(); if (this.termIndex != null) this.termIndex.clear();
this.urlMetadata.clear(); if (this.urlMetadata != null) this.urlMetadata.clear();
this.urlCitationIndex.clear(); if (this.urlCitationIndex != null) this.urlCitationIndex.clear();
} catch (final IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
} }
@ -328,7 +336,7 @@ public class Segment {
assert (wprop.flags != null); assert (wprop.flags != null);
ientry.setWord(wprop); ientry.setWord(wprop);
wordhash = Word.word2hash(word); wordhash = Word.word2hash(word);
try { if (this.termIndex != null) try {
this.termIndex.add(wordhash, ientry); this.termIndex.add(wordhash, ientry);
} catch (final Exception e) { } catch (final Exception e) {
Log.logException(e); Log.logException(e);
@ -354,7 +362,7 @@ public class Segment {
// assign the catchall word // assign the catchall word
ientry.setWord(wprop == null ? catchallWord : wprop); // we use one of the word properties as template to get the document characteristics ientry.setWord(wprop == null ? catchallWord : wprop); // we use one of the word properties as template to get the document characteristics
try { if (this.termIndex != null) try {
this.termIndex.add(catchallHash, ientry); this.termIndex.add(catchallHash, ientry);
} catch (final Exception e) { } catch (final Exception e) {
Log.logException(e); Log.logException(e);
@ -385,9 +393,9 @@ public class Segment {
} }
public synchronized void close() { public synchronized void close() {
this.termIndex.close(); if (this.termIndex != null) this.termIndex.close();
this.urlMetadata.close(); if (this.urlMetadata != null) this.urlMetadata.close();
this.urlCitationIndex.close(); if (this.urlCitationIndex != null) this.urlCitationIndex.close();
} }
public URIMetadataRow storeDocument( public URIMetadataRow storeDocument(
@ -541,7 +549,7 @@ public class Segment {
if (urlhash == null) return 0; if (urlhash == null) return 0;
// determine the url string // determine the url string
final URIMetadataRow entry = urlMetadata().load(urlhash); final URIMetadata entry = urlMetadata().load(urlhash);
if (entry == null) return 0; if (entry == null) return 0;
if (entry.url() == null) return 0; if (entry.url() == null) return 0;
@ -612,7 +620,7 @@ public class Segment {
entry = new WordReferenceVars(containerIterator.next()); entry = new WordReferenceVars(containerIterator.next());
// System.out.println("Wordhash: "+wordHash+" UrlHash: // System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash()); // "+entry.getUrlHash());
final URIMetadataRow ue = Segment.this.urlMetadata.load(entry.urlhash()); final URIMetadata ue = Segment.this.urlMetadata.load(entry.urlhash());
if (ue == null) { if (ue == null) {
urlHashs.put(entry.urlhash()); urlHashs.put(entry.urlhash());
} else { } else {

@ -55,6 +55,7 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReference;
@ -616,7 +617,7 @@ public final class RWIProcess extends Thread
* @param waitingtime the time this method may take for a result computation * @param waitingtime the time this method may take for a result computation
* @return a metadata entry for a url * @return a metadata entry for a url
*/ */
public URIMetadataRow takeURL(final boolean skipDoubleDom, final long waitingtime) { public URIMetadata takeURL(final boolean skipDoubleDom, final long waitingtime) {
// returns from the current RWI list the best URL entry and removes this entry from the list // returns from the current RWI list the best URL entry and removes this entry from the list
final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime); final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime);
int p = -1; int p = -1;
@ -627,7 +628,7 @@ public final class RWIProcess extends Thread
if ( obrwi == null ) { if ( obrwi == null ) {
return null; // all time was already wasted in takeRWI to get another element return null; // all time was already wasted in takeRWI to get another element
} }
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi); final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi);
if ( page == null ) { if ( page == null ) {
try { try {
this.misses.putUnique(obrwi.getElement().urlhash()); this.misses.putUnique(obrwi.getElement().urlhash());
@ -864,7 +865,7 @@ public final class RWIProcess extends Thread
} }
final Iterator<String> domhashs = this.hostNavigator.keys(false); final Iterator<String> domhashs = this.hostNavigator.keys(false);
URIMetadataRow row; URIMetadata row;
byte[] urlhash; byte[] urlhash;
String hosthash, hostname; String hosthash, hostname;
if ( this.hostResolver != null ) { if ( this.hostResolver != null ) {

@ -41,7 +41,7 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element; import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement; import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.index.RowSpaceExceededException;
@ -454,7 +454,7 @@ public class SnippetProcess {
public void run() { public void run() {
// start fetching urls and snippets // start fetching urls and snippets
URIMetadataRow page; URIMetadata page;
ResultEntry resultEntry; ResultEntry resultEntry;
//final int fetchAhead = snippetMode == 0 ? 0 : 10; //final int fetchAhead = snippetMode == 0 ? 0 : 10;
final boolean nav_topics = SnippetProcess.this.query.navigators.equals("all") || SnippetProcess.this.query.navigators.indexOf("topics",0) >= 0; final boolean nav_topics = SnippetProcess.this.query.navigators.equals("all") || SnippetProcess.this.query.navigators.indexOf("topics",0) >= 0;
@ -498,7 +498,7 @@ public class SnippetProcess {
String solrContent = null; String solrContent = null;
if (this.solr != null) { if (this.solr != null) {
SolrDocument sd = null; SolrDocument sd = null;
final SolrDocumentList sdl = this.solr.get(SolrField.id.getSolrFieldName()+ ":" + ASCII.String(page.hash()), 0, 1); final SolrDocumentList sdl = this.solr.query(SolrField.id.getSolrFieldName()+ ":" + ASCII.String(page.hash()), 0, 1);
if (!sdl.isEmpty()) { if (!sdl.isEmpty()) {
sd = sdl.get(0); sd = sdl.get(0);
} }
@ -553,7 +553,7 @@ public class SnippetProcess {
} }
} }
protected ResultEntry fetchSnippet(final URIMetadataRow page, final String solrText, final CacheStrategy cacheStrategy) { protected ResultEntry fetchSnippet(final URIMetadata page, final String solrText, final CacheStrategy cacheStrategy) {
// Snippet fetching has 3 modes: // Snippet fetching has 3 modes:
// 0 - do not fetch snippets // 0 - do not fetch snippets
// 1 - fetch snippets offline only // 1 - fetch snippets offline only

@ -34,7 +34,7 @@ import java.util.List;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
@ -50,7 +50,7 @@ import net.yacy.search.index.Segment;
public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEntry> { public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEntry> {
// payload objects // payload objects
private final URIMetadataRow urlentry; private final URIMetadata urlentry;
private String alternative_urlstring; private String alternative_urlstring;
private String alternative_urlname; private String alternative_urlname;
private final TextSnippet textSnippet; private final TextSnippet textSnippet;
@ -60,7 +60,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
// statistic objects // statistic objects
public long dbRetrievalTime, snippetComputationTime, ranking; public long dbRetrievalTime, snippetComputationTime, ranking;
public ResultEntry(final URIMetadataRow urlentry, public ResultEntry(final URIMetadata urlentry,
final Segment indexSegment, final Segment indexSegment,
SeedDB peers, SeedDB peers,
final TextSnippet textSnippet, final TextSnippet textSnippet,

@ -45,7 +45,7 @@ import net.yacy.document.SnippetExtractor;
import net.yacy.document.WordTokenizer; import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Base64Order;
@ -146,7 +146,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
public TextSnippet( public TextSnippet(
final LoaderDispatcher loader, final LoaderDispatcher loader,
final String solrText, final String solrText,
final URIMetadataRow row, final URIMetadata row,
final HandleSet queryhashes, final HandleSet queryhashes,
final CacheStrategy cacheStrategy, final CacheStrategy cacheStrategy,
final boolean pre, final boolean pre,

@ -155,7 +155,7 @@ public class EmbeddedSolrConnector extends AbstractSolrConnector implements Solr
solrdoc.addSolr(SolrField.text_t, "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."); solrdoc.addSolr(SolrField.text_t, "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.");
solr.add(solrdoc); solr.add(solrdoc);
SolrServlet.startServer("/solr", 8091, solr); SolrServlet.startServer("/solr", 8091, solr);
SolrDocumentList searchresult = solr.get(SolrField.text_t.name() + ":tempor", 0, 10); SolrDocumentList searchresult = solr.query(SolrField.text_t.name() + ":tempor", 0, 10);
for (SolrDocument d : searchresult) { for (SolrDocument d : searchresult) {
System.out.println(d.toString()); System.out.println(d.toString());
} }

@ -1,4 +1,3 @@
package net.yacy;
// yacy.java // yacy.java
// ----------------------- // -----------------------
// (C) by Michael Peter Christen; mc@yacy.net // (C) by Michael Peter Christen; mc@yacy.net
@ -23,8 +22,8 @@ package net.yacy;
// along with this program; if not, write to the Free Software // along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy;
//import java.io.BufferedInputStream;
import java.io.BufferedOutputStream; import java.io.BufferedOutputStream;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.BufferedWriter; import java.io.BufferedWriter;
@ -61,7 +60,7 @@ import net.yacy.cora.sorting.ScoreMap;
import net.yacy.gui.YaCyApp; import net.yacy.gui.YaCyApp;
import net.yacy.gui.framework.Browser; import net.yacy.gui.framework.Browser;
import net.yacy.kelondro.blob.MapDataMining; import net.yacy.kelondro.blob.MapDataMining;
import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
@ -657,11 +656,13 @@ public final class yacy {
log.logInfo("STARTING URL CLEANUP"); log.logInfo("STARTING URL CLEANUP");
// db containing all currently loaded urls // db containing all currently loaded urls
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"), "text.urlmd", false, false); final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"));
currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
// db used to hold all needed urls // db used to hold all needed urls
final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"), "text.urlmd", false, false); final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"));
minimizedUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total()); final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
@ -669,7 +670,14 @@ public final class yacy {
log, log,
new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
10000, 10000,
Integer.MAX_VALUE, false, false, false); Integer.MAX_VALUE,
false, // useTailCache
false, // exceed134217727
false, // connectLocalSolr
false, // useCitationIndex
true, // useRWI
true // useMetadata
);
final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false); final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false);
long urlCounter = 0, wordCounter = 0; long urlCounter = 0, wordCounter = 0;
@ -689,7 +697,7 @@ public final class yacy {
iEntry = wordIdxEntries.next(); iEntry = wordIdxEntries.next();
final byte[] urlHash = iEntry.urlhash(); final byte[] urlHash = iEntry.urlhash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
final URIMetadataRow urlEntry = currentUrlDB.load(urlHash); final URIMetadata urlEntry = currentUrlDB.load(urlHash);
urlCounter++; urlCounter++;
minimizedUrlDB.store(urlEntry); minimizedUrlDB.store(urlEntry);
if (urlCounter % 500 == 0) { if (urlCounter % 500 == 0) {
@ -829,7 +837,8 @@ public final class yacy {
final File root = dataHome; final File root = dataHome;
final File indexroot = new File(root, "DATA/INDEX"); final File indexroot = new File(root, "DATA/INDEX");
try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {} try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"), "text.urlmd", false, false); final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"));
currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
currentUrlDB.deadlinkCleaner(); currentUrlDB.deadlinkCleaner();
currentUrlDB.close(); currentUrlDB.close();
} }
@ -849,7 +858,14 @@ public final class yacy {
log, log,
new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
10000, 10000,
Integer.MAX_VALUE, false, false, false); Integer.MAX_VALUE,
false, // useTailCache
false, // exceed134217727
false, // connectLocalSolr
false, // useCitationIndex
true, // useRWI
true // useMetadata
);
indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false); indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);
} }
int counter = 0; int counter = 0;

Loading…
Cancel
Save