From 43f3345c900e69fd06d7df1792f961814e61f147 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 16 Oct 2012 18:11:57 +0200 Subject: [PATCH] - removed dependencies from URIMetadataRow and made direct access to URIMetadataNode which creates the opportunity to access Solr objects directly and use their information richness - lazy initialization of the URIMetadataNode object - should cause less computation and memory usage during search. - removed dead code --- defaults/solr.keys.list | 2 +- htroot/Bookmarks.java | 4 +- htroot/CrawlResults.java | 4 +- htroot/HostBrowser.java | 2 +- htroot/IndexControlRWIs_p.java | 8 +- htroot/IndexControlURLs_p.java | 7 +- htroot/ViewFile.java | 4 +- htroot/Vocabulary_p.java | 6 +- htroot/api/yacydoc.java | 3 +- htroot/yacy/urls.java | 4 +- htroot/yacysearch.java | 6 +- source/net/yacy/crawler/CrawlStacker.java | 4 +- .../crawler/retrieval/SitemapImporter.java | 4 +- source/net/yacy/data/ymark/YMarkMetadata.java | 4 +- .../kelondro/data/meta/URIMetadataNode.java | 85 +++++++++++-------- .../kelondro/data/meta/URIMetadataRow.java | 4 - .../kelondro/data/word/WordReferenceRow.java | 76 +---------------- .../kelondro/data/word/WordReferenceVars.java | 14 +-- source/net/yacy/kelondro/index/RowSet.java | 2 +- source/net/yacy/peers/Transmission.java | 9 +- .../peers/graphics/WebStructureGraph.java | 2 +- source/net/yacy/search/Switchboard.java | 3 +- source/net/yacy/search/index/Fulltext.java | 66 ++++---------- source/net/yacy/search/index/Segment.java | 3 +- source/net/yacy/search/query/RWIProcess.java | 5 +- 25 files changed, 116 insertions(+), 215 deletions(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index ba97e608f..8f2e44b55 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -44,7 +44,7 @@ failreason_t ## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int (mandatory field) httpstatus_i -## html status return code (i.e. 
\"200\" for ok), -1 if not loaded +## redirect url if the error code is 299 < httpstatus_i < 310 #httpstatus_redirect_s diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index d3c8a1fbd..fc4bedcb6 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -52,7 +52,7 @@ import net.yacy.data.BookmarksDB.Tag; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.logging.Log; import net.yacy.peers.NewsPool; import net.yacy.search.Switchboard; @@ -195,7 +195,7 @@ public class Bookmarks { final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash); if (bookmark == null) { // try to get the bookmark from the LURL database - final URIMetadata urlentry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlHash)); + final URIMetadataNode urlentry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlHash)); if (urlentry != null) try { final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay)); prop.put("mode_edit", "0"); // create mode diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index 1d29bf621..b6b982072 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -40,7 +40,7 @@ import net.yacy.crawler.data.ResultURLs; import net.yacy.crawler.data.ResultURLs.EventOrigin; import net.yacy.crawler.data.ResultURLs.InitExecEntry; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.logging.Log; import net.yacy.peers.Seed; import net.yacy.search.Switchboard; @@ -182,7 +182,7 @@ public class CrawlResults { boolean dark = true; String urlstr, urltxt; Seed 
initiatorSeed, executorSeed; - URIMetadata urle; + URIMetadataNode urle; int cnt = 0; final Iterator> i = ResultURLs.results(tabletype); diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 7f77b8c6a..9ca7de426 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -158,7 +158,7 @@ public class HostBrowser { String host = uri.getHost(); prop.putHTML("outbound_host", host); prop.putHTML("inbound_host", host); - String hosthash = ASCII.String(uri.hash(), 6, 12); + String hosthash = ASCII.String(uri.hash(), 6, 6); // get all files for a specific host from the index BlockingQueue docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000); diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 9246e5e07..82dc081a3 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -278,7 +278,7 @@ public class IndexControlRWIs_p { WordReferenceRow.urlEntryRow.objectOrder, index.size()); Reference iEntry; - URIMetadata lurl; + URIMetadataNode lurl; while (urlIter.hasNext()) { iEntry = urlIter.next(); lurl = segment.fulltext().getMetadata(iEntry.urlhash()); @@ -290,11 +290,7 @@ public class IndexControlRWIs_p { } urlIter.remove(); } else { - if (lurl instanceof URIMetadataRow) { - knownURLs.put(iEntry.urlhash(), (URIMetadataRow) lurl); - } else if (lurl instanceof URIMetadataNode) { - knownURLs.put(iEntry.urlhash(), ((URIMetadataNode) lurl).toRow()); - } + knownURLs.put(iEntry.urlhash(), lurl.toRow()); } } diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index fcfaf581d..f1f2c9fc8 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -42,6 +42,7 @@ import net.yacy.crawler.data.ResultURLs; import net.yacy.data.WorkTables; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import 
net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.RotateIterator; @@ -175,7 +176,7 @@ public class IndexControlURLs_p { } if (post.containsKey("urlhashdelete")) { - final URIMetadata entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); + final URIMetadataNode entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); if (entry == null) { prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } else { @@ -233,7 +234,7 @@ public class IndexControlURLs_p { // generate list if (post.containsKey("urlhashsimilar")) { - final Iterator entryIt = new RotateIterator(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount()); + final Iterator entryIt = new RotateIterator(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount()); final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:
"); URIMetadata entry; int i = 0, rows = 0, cols = 0; @@ -347,7 +348,7 @@ public class IndexControlURLs_p { prop.put("genUrlProfile_urlhash", urlhash); return prop; } - final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.fulltext().getMetadata(entry.referrerHash()); + final URIMetadataNode le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.fulltext().getMetadata(entry.referrerHash()); if (entry.url() == null) { prop.put("genUrlProfile", "1"); prop.put("genUrlProfile_urlhash", urlhash); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 6268633f8..26087953c 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -55,7 +55,7 @@ import net.yacy.document.WordTokenizer; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; import net.yacy.server.serverObjects; @@ -131,7 +131,7 @@ public class ViewFile { pre = post.getBoolean("pre"); } catch (final MalformedURLException e) {} - URIMetadata urlEntry = null; + URIMetadataNode urlEntry = null; // get the urlEntry that belongs to the url hash //boolean ue = urlHash.length() > 0 && indexSegment.exists(ASCII.getBytes(urlHash)); //if (ue) Log.logInfo("ViewFile", "exists(" + urlHash + ")"); diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index cde5e232c..9ccb623f8 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -35,7 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.LibraryProvider; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; +import 
net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; @@ -86,12 +86,12 @@ public class Vocabulary_p { if (p >= 0) t = t.substring(p + 1); } if (discoverFromTitle || discoverFromTitleSplitted) { - URIMetadata m = segment.fulltext().getMetadata(u.hash()); + URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); if (m != null) t = m.dc_title(); if (t.endsWith(".jpg") || t.endsWith(".gif")) continue; } if (discoverFromAuthor) { - URIMetadata m = segment.fulltext().getMetadata(u.hash()); + URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); if (m != null) t = m.dc_creator(); } t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); diff --git a/htroot/api/yacydoc.java b/htroot/api/yacydoc.java index 2e1325e9d..54777795e 100644 --- a/htroot/api/yacydoc.java +++ b/htroot/api/yacydoc.java @@ -37,6 +37,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader.FileType; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; @@ -99,7 +100,7 @@ public class yacydoc { } if (urlhash == null || urlhash.isEmpty()) return prop; - final URIMetadata entry = segment.fulltext().getMetadata(urlhash.getBytes()); + final URIMetadataNode entry = segment.fulltext().getMetadata(urlhash.getBytes()); if (entry == null) return prop; if (entry.url() == null) { diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index 9557a6e6e..6a9c6667f 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -34,7 +34,7 @@ import net.yacy.crawler.data.NoticedURL; import net.yacy.crawler.data.ZURL.FailCategory; import net.yacy.crawler.retrieval.Request; import 
net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.peers.Protocol; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; @@ -110,7 +110,7 @@ public class urls { if (urlhashes.length() % 12 != 0) return prop; final int count = urlhashes.length() / 12; int c = 0; - URIMetadata entry; + URIMetadataNode entry; DigestURI referrer; for (int i = 0; i < count; i++) { entry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1)))); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index b78b51a58..1900cb866 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -60,7 +60,7 @@ import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.Parser; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.Bitfield; @@ -653,7 +653,7 @@ public class yacysearch { return prop; } final String recommendHash = post.get("recommendref", ""); // urlhash - final URIMetadata urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(recommendHash)); + final URIMetadataNode urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(recommendHash)); if ( urlentry != null ) { Document[] documents = null; try { @@ -689,7 +689,7 @@ public class yacysearch { return prop; } final String bookmarkHash = post.get("bookmarkref", ""); // urlhash - final URIMetadata urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(bookmarkHash)); + final URIMetadataNode urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(bookmarkHash)); if ( urlentry != null ) { try { sb.tables.bookmarks.createBookmark( diff --git a/source/net/yacy/crawler/CrawlStacker.java 
b/source/net/yacy/crawler/CrawlStacker.java index ffd388ef4..b5f8062cc 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -59,7 +59,7 @@ import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.SMBLoader; import net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.peers.SeedDB; @@ -447,7 +447,7 @@ public final class CrawlStacker { // check if the url is double registered final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists - final URIMetadata oldEntry = this.indexSegment.fulltext().getMetadata(url.hash()); + final URIMetadataNode oldEntry = this.indexSegment.fulltext().getMetadata(url.hash()); if (oldEntry == null) { if (dbocc != null) { // do double-check diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java index 5da58742f..f7f5dd132 100644 --- a/source/net/yacy/crawler/retrieval/SitemapImporter.java +++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java @@ -33,7 +33,7 @@ import net.yacy.crawler.data.CrawlProfile; import net.yacy.document.parser.sitemapParser; import net.yacy.document.parser.sitemapParser.URLEntry; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; @@ -84,7 +84,7 @@ public class SitemapImporter extends Thread { final String dbocc = this.sb.urlExists(nexturlhash); if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) { // the url was already loaded. 
we need to check the date - final URIMetadata oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash); + final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash); if (oldEntry != null) { final Date modDate = oldEntry.moddate(); // check if modDate is null diff --git a/source/net/yacy/data/ymark/YMarkMetadata.java b/source/net/yacy/data/ymark/YMarkMetadata.java index c1b219446..c04aa87d2 100644 --- a/source/net/yacy/data/ymark/YMarkMetadata.java +++ b/source/net/yacy/data/ymark/YMarkMetadata.java @@ -37,7 +37,7 @@ import net.yacy.crawler.retrieval.Response; import net.yacy.document.Document; import net.yacy.document.Parser.Failure; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.index.Segment; import net.yacy.search.snippet.TextSnippet; @@ -106,7 +106,7 @@ public class YMarkMetadata { public EnumMap getMetadata() { final EnumMap metadata = new EnumMap(METADATA.class); - final URIMetadata urlEntry = this.indexSegment.fulltext().getMetadata(this.uri.hash()); + final URIMetadataNode urlEntry = this.indexSegment.fulltext().getMetadata(this.uri.hash()); if (urlEntry != null) { metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size())); metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate())); diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index b455b7251..5cda2a72f 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -52,16 +52,16 @@ import org.apache.solr.common.SolrDocument; */ public class URIMetadataNode implements URIMetadata { - private final byte[] hash; - private final String urlRaw, keywords; - private DigestURI url; - private Bitfield flags; - private final int imagec, 
audioc, videoc, appc; - private double lat, lon; - private long ranking; // during generation of a search result this value is set - private final SolrDocument doc; - private final String snippet; - private WordReference word; // this is only used if the url is transported via remote search requests + private byte[] hash = null; + private String urlRaw = null, keywords = null; + private DigestURI url = null; + private Bitfield flags = null; + private int imagec = -1, audioc = -1, videoc = -1, appc = -1; + private double lat = Double.NaN, lon = Double.NaN; + private long ranking = -1; // during generation of a search result this value is set + private SolrDocument doc = null; + private String snippet = null; + private WordReference word = null; // this is only used if the url is transported via remote search requests public URIMetadataNode(final SolrDocument doc) { this.doc = doc; @@ -76,30 +76,6 @@ public class URIMetadataNode implements URIMetadata { Log.logException(e); this.url = null; } - - // to set the flags bitfield we need to pre-load some values from the Solr document - this.keywords = getString(YaCySchema.keywords); - this.imagec = getInt(YaCySchema.imagescount_i); - this.audioc = getInt(YaCySchema.audiolinkscount_i); - this.videoc = getInt(YaCySchema.videolinkscount_i); - this.appc = getInt(YaCySchema.videolinkscount_i); - this.lon = 0.0d; - this.lat = 0.0d; - String latlon = (String) this.doc.getFieldValue(YaCySchema.coordinate_p.name()); - if (latlon != null) { - int p = latlon.indexOf(','); - if (p > 0) { - this.lat = Double.parseDouble(latlon.substring(0, p)); - this.lon = Double.parseDouble(latlon.substring(p + 1)); - } - } - this.flags = new Bitfield(); - if (this.keywords != null && this.keywords.indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true); - if (this.lon != 0.0d || this.lat != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true); - if (this.imagec > 0) this.flags.set(Condenser.flag_cat_hasimage, true); - if 
(this.audioc > 0) this.flags.set(Condenser.flag_cat_hasaudio, true); - if (this.videoc > 0) this.flags.set(Condenser.flag_cat_hasvideo, true); - if (this.appc > 0) this.flags.set(Condenser.flag_cat_hasapp, true); } public URIMetadataNode(final SolrDocument doc, final WordReference searchedWord, final long ranking) { @@ -206,16 +182,32 @@ public class URIMetadataNode implements URIMetadata { @Override public String dc_subject() { + if (this.keywords == null) { + this.keywords = getString(YaCySchema.keywords); + } return this.keywords; } @Override public double lat() { + if (Double.isNaN(this.lat)) { /* NaN is the "not loaded yet" sentinel; NaN never compares equal with ==, so isNaN is required for the lazy load to run */ + this.lon = 0.0d; + this.lat = 0.0d; + String latlon = (String) this.doc.getFieldValue(YaCySchema.coordinate_p.name()); + if (latlon != null) { + int p = latlon.indexOf(','); + if (p > 0) { + this.lat = Double.parseDouble(latlon.substring(0, p)); + this.lon = Double.parseDouble(latlon.substring(p + 1)); + } + } + } return this.lat; } @Override public double lon() { + if (Double.isNaN(this.lon)) lat(); /* lat() parses the coordinate field once and caches both lat and lon */ return this.lon; } @@ -242,7 +234,7 @@ public class URIMetadataNode implements URIMetadata { @Override public char doctype() { ArrayList a = getArrayList(YaCySchema.content_type); - if (a == null || a.size() == 0) return Response.docType(this.url); + if (a == null || a.size() == 0) return Response.docType(url()); return Response.docType(a.get(0)); } @@ -268,6 +260,15 @@ public class URIMetadataNode implements URIMetadata { @Override public Bitfield flags() { + if (flags == null) { + this.flags = new Bitfield(); + if (dc_subject() != null && dc_subject().indexOf("indexof") >= 0) this.flags.set(Condenser.flag_cat_indexof, true); + if (lon() != 0.0d || lat() != 0.0d) this.flags.set(Condenser.flag_cat_haslocation, true); + if (limage() > 0) this.flags.set(Condenser.flag_cat_hasimage, true); + if (laudio() > 0) this.flags.set(Condenser.flag_cat_hasaudio, true); + if (lvideo() > 0) this.flags.set(Condenser.flag_cat_hasvideo, true); + if (lapp() > 0) 
this.flags.set(Condenser.flag_cat_hasapp, true); + } return this.flags; } @@ -288,21 +289,33 @@ public class URIMetadataNode implements URIMetadata { @Override public int limage() { + if (this.imagec == -1) { + this.imagec = getInt(YaCySchema.imagescount_i); + } return this.imagec; } @Override public int laudio() { + if (this.audioc == -1) { + this.audioc = getInt(YaCySchema.audiolinkscount_i); + } return this.audioc; } @Override public int lvideo() { + if (this.videoc == -1) { + this.videoc = getInt(YaCySchema.videolinkscount_i); + } return this.videoc; } @Override public int lapp() { + if (this.appc == -1) { + this.appc = getInt(YaCySchema.videolinkscount_i); + } return this.appc; } @@ -337,7 +350,7 @@ public class URIMetadataNode implements URIMetadata { return false; } - public static StringBuilder corePropList(URIMetadata md) { + protected static StringBuilder corePropList(URIMetadata md) { // generate a parseable string; this is a simple property-list final StringBuilder s = new StringBuilder(300); diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index eedff3435..d191b2dff 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -300,10 +300,6 @@ public class URIMetadataRow implements URIMetadata { } } - public Row.Entry toRowEntry() { - return this.entry; - } - @Override public byte[] hash() { // return a url-hash, based on the md5 algorithm diff --git a/source/net/yacy/kelondro/data/word/WordReferenceRow.java b/source/net/yacy/kelondro/data/word/WordReferenceRow.java index 16a197857..f389b95b8 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceRow.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceRow.java @@ -28,8 +28,6 @@ package net.yacy.kelondro.data.word; import java.util.ArrayList; import java.util.Collection; -import java.util.concurrent.BlockingQueue; -import 
java.util.concurrent.LinkedBlockingQueue; import net.yacy.cora.date.MicroDate; import net.yacy.cora.document.ASCII; @@ -79,9 +77,8 @@ public final class WordReferenceRow extends AbstractReference implements WordRef * object for termination of concurrent blocking queue processing */ public static final Row.Entry poisonRowEntry = urlEntryRow.newEntry(); - private static final WordReferenceRow poison = new WordReferenceRow(poisonRowEntry); - - // static properties + + // static properties private static final int col_urlhash = 0; // h 12 the url hash b64-encoded private static final int col_lastModified = 1; // a 2 last-modified time of the document where word appears private static final int col_freshUntil = 2; // s 2 TTL for the word, so it can be removed easily if the TTL is short @@ -204,70 +201,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef this.entry.setCol(col_posinphrase, word.posInPhrase); this.entry.setCol(col_posofphrase, word.numOfPhrase); } - - public static class ExternalParser { - private static final String PIN = "_"; - private final BlockingQueue in; - private final BlockingQueue out; - private Thread[] worker; - public ExternalParser(final int concurrency) { - this.in = new LinkedBlockingQueue(); - this.out = new LinkedBlockingQueue(); - for (int i = 0; i < concurrency; i++) { - this.worker[i] = new Thread() { - @Override - public void run() { - Thread.currentThread().setName("WordReferenceRow.ExternalParser:" + concurrency); - String s; - try { - while ((s = ExternalParser.this.in.take()) != PIN) { - ExternalParser.this.out.put(new WordReferenceRow(s)); - } - } catch (final InterruptedException e) { - } - } - }; - this.worker[i].start(); - } - } - public ExternalParser() { - this(Runtime.getRuntime().availableProcessors()); - } - public void put(final String s) { - try { - this.in.put(s); - } catch (final InterruptedException e) { - } - } - public void terminate() { - for (@SuppressWarnings("unused") final 
Thread w : this.worker) { - try { - this.in.put(PIN); - } catch (final InterruptedException e) { - } - } - for (final Thread w : this.worker) { - try { - if (w.isAlive()) w.join(); - } catch (final InterruptedException e) { - } - } - try { - this.out.put(poison); - } catch (final InterruptedException e) { - } - } - public WordReferenceRow take() { - WordReferenceRow row; - try { - row = this.out.take(); - } catch (final InterruptedException e) { - return poison; - } - return row; - } - } - + public WordReferenceRow(final String external) { this.entry = urlEntryRow.newEntry(external, true); } @@ -276,9 +210,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef this.entry = urlEntryRow.newEntry(row); } - public WordReferenceRow(final byte[] row, final int offset, final boolean clone) { - this.entry = urlEntryRow.newEntry(row, offset, clone); - } + public WordReferenceRow(final Row.Entry rentry) { // no cloning is necessary since there is no further manipulation after this initial instantiation diff --git a/source/net/yacy/kelondro/data/word/WordReferenceVars.java b/source/net/yacy/kelondro/data/word/WordReferenceVars.java index 5e19a1c87..ff12169f4 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java @@ -55,7 +55,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc */ public static final WordReferenceVars poison = new WordReferenceVars(); private static int cores = Runtime.getRuntime().availableProcessors(); - public static final byte[] default_language = UTF8.getBytes("uk"); + protected static final byte[] default_language = UTF8.getBytes("uk"); private final Bitfield flags; private long lastModified; @@ -274,7 +274,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc return this.posofphrase; } - public WordReferenceRow toRowEntry() { + private WordReferenceRow toRowEntry() { return new 
WordReferenceRow( this.urlHash, this.urllength, // byte-length of complete URL @@ -470,13 +470,13 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc return vars; } - public static class TransformDistributor extends Thread { + private static class TransformDistributor extends Thread { private ReferenceContainer container; private BlockingQueue out; private long maxtime; - public TransformDistributor(final ReferenceContainer container, final BlockingQueue out, final long maxtime) { + private TransformDistributor(final ReferenceContainer container, final BlockingQueue out, final long maxtime) { this.container = container; this.out = out; this.maxtime = maxtime; @@ -521,19 +521,19 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc } } - public static class TransformWorker extends Thread { + private static class TransformWorker extends Thread { private BlockingQueue in; private BlockingQueue out; private long maxtime; - public TransformWorker(final BlockingQueue out, final long maxtime) { + private TransformWorker(final BlockingQueue out, final long maxtime) { this.in = new LinkedBlockingQueue(); this.out = out; this.maxtime = maxtime; } - public void add(final Row.Entry entry) { + private void add(final Row.Entry entry) { try { this.in.put(entry); } catch (final InterruptedException e) { diff --git a/source/net/yacy/kelondro/index/RowSet.java b/source/net/yacy/kelondro/index/RowSet.java index 3c33ef0df..e47d80876 100644 --- a/source/net/yacy/kelondro/index/RowSet.java +++ b/source/net/yacy/kelondro/index/RowSet.java @@ -293,7 +293,7 @@ public class RowSet extends RowCollection implements Index, Iterable, if (this.rowdef.objectOrder != null && this.rowdef.objectOrder instanceof Base64Order) { // first try to find in sorted area - assert this.rowdef.objectOrder.wellformed(a, astart, this.rowdef.primaryKeyLength) : "not wellformed: " + UTF8.String(a, astart, this.rowdef.primaryKeyLength); + assert 
this.rowdef.objectOrder.wellformed(a, astart, this.rowdef.primaryKeyLength) : "not wellformed: " + ASCII.String(a, astart, this.rowdef.primaryKeyLength); } // first try to find in sorted area diff --git a/source/net/yacy/peers/Transmission.java b/source/net/yacy/peers/Transmission.java index 238ab2deb..9928ab7db 100644 --- a/source/net/yacy/peers/Transmission.java +++ b/source/net/yacy/peers/Transmission.java @@ -35,7 +35,6 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.order.Base64Order; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.SpaceExceededException; -import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -175,16 +174,12 @@ public class Transmission { notFoundx.add(e.urlhash()); continue; } - final URIMetadata r = Transmission.this.segment.fulltext().getMetadata(e.urlhash()); + final URIMetadataNode r = Transmission.this.segment.fulltext().getMetadata(e.urlhash()); if (r == null) { notFoundx.add(e.urlhash()); this.badReferences.put(e.urlhash()); } else { - if (r instanceof URIMetadataRow) { - this.references.put(e.urlhash(), (URIMetadataRow) r); - } else if (r instanceof URIMetadataNode) { - this.references.put(e.urlhash(), ((URIMetadataNode) r).toRow()); - } + this.references.put(e.urlhash(), r.toRow()); } } // now delete all references that were not found diff --git a/source/net/yacy/peers/graphics/WebStructureGraph.java b/source/net/yacy/peers/graphics/WebStructureGraph.java index a3e8305ae..cdffc2f6b 100644 --- a/source/net/yacy/peers/graphics/WebStructureGraph.java +++ b/source/net/yacy/peers/graphics/WebStructureGraph.java @@ -571,7 +571,7 @@ public class WebStructureGraph { for ( final MultiProtocolURI u : lro.globalRefURLs ) { if (Switchboard.getSwitchboard().shallTerminate()) break; du = DigestURI.toDigestURI(u); - hosthash = ASCII.String(du.hash(), 6, 12); + hosthash = 
ASCII.String(du.hash(), 6, 6); if (!exists(hosthash)) { // this must be recorded as an host with no references synchronized ( this.structure_new ) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index d337e3528..b4e802539 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -147,6 +147,7 @@ import net.yacy.interaction.contentcontrol.ContentControlImportThread; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.rwi.ReferenceContainer; @@ -1529,7 +1530,7 @@ public final class Switchboard extends serverSwitch if ( urlhash.length == 0 ) { return null; } - final URIMetadata le = this.index.fulltext().getMetadata(urlhash); + final URIMetadataNode le = this.index.fulltext().getMetadata(urlhash); if ( le != null ) { return le.url(); } diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 8183eff74..036aff09a 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -198,17 +198,17 @@ public final class Fulltext implements Iterable { * @param obrwi * @return */ - public URIMetadata getMetadata(WordReference wre, long weight) { + public URIMetadataNode getMetadata(WordReference wre, long weight) { if (wre == null) return null; // all time was already wasted in takeRWI to get another element return getMetadata(wre.urlhash(), wre, weight); } - public URIMetadata getMetadata(final byte[] urlHash) { + public URIMetadataNode getMetadata(final byte[] urlHash) { if (urlHash == null) return null; return getMetadata(urlHash, null, 0); } - private URIMetadata getMetadata(final byte[] urlHash, WordReference wre, long weight) { + private URIMetadataNode 
getMetadata(final byte[] urlHash, WordReference wre, long weight) { // get the metadata from Solr try { @@ -227,46 +227,9 @@ public final class Fulltext implements Iterable { final Row.Entry entry = this.urlIndexFile.remove(urlHash); if (entry == null) return null; URIMetadataRow row = new URIMetadataRow(entry, wre, weight); - this.putDocument(this.solrScheme.metadata2solr(row)); - return row; - } catch (final IOException e) { - Log.logException(e); - } - - return null; - } - - public SolrDocument getDocument(WordReference wre, long weight) { - if (wre == null) return null; // all time was already wasted in takeRWI to get another element - return getDocument(wre.urlhash(), wre, weight); - } - - public SolrDocument getDocument(final byte[] urlHash) { - if (urlHash == null) return null; - return getDocument(urlHash, null, 0); - } - - private SolrDocument getDocument(final byte[] urlHash, WordReference wre, long weight) { - - // get the document from Solr - try { - SolrDocument doc = this.solr.get(ASCII.String(urlHash)); - if (doc != null) { - if (this.urlIndexFile != null) this.urlIndexFile.remove(urlHash); - return doc; - } - } catch (IOException e) { - Log.logException(e); - } - - // get the document from the old metadata index - if (this.urlIndexFile != null) try { - // slow migration to solr - final Row.Entry entry = this.urlIndexFile.remove(urlHash); - if (entry == null) return null; - URIMetadataRow row = new URIMetadataRow(entry, wre, weight); - this.putDocument(this.solrScheme.metadata2solr(row)); - return ClientUtils.toSolrDocument(getSolrScheme().metadata2solr(row)); + SolrInputDocument solrInput = this.solrScheme.metadata2solr(row); + this.putDocument(solrInput); + return new URIMetadataNode(ClientUtils.toSolrDocument(solrInput), wre, weight); } catch (final IOException e) { Log.logException(e); } @@ -303,6 +266,7 @@ public final class Fulltext implements Iterable { public void putMetadata(final URIMetadata entry) throws IOException { if (entry instanceof 
URIMetadataNode) { putDocument(ClientUtils.toSolrInputDocument(((URIMetadataNode) entry).getDocument())); + return; } assert entry instanceof URIMetadataRow; URIMetadataRow row = (URIMetadataRow) entry; @@ -399,12 +363,12 @@ public final class Fulltext implements Iterable { true); } - public CloneableIterator entries() { + public CloneableIterator entries() { // enumerates entry elements final Iterator ids = iterator(); - return new CloneableIterator() { + return new CloneableIterator() { @Override - public CloneableIterator clone(final Object secondHash) { + public CloneableIterator clone(final Object secondHash) { return this; } @Override @@ -412,7 +376,7 @@ public final class Fulltext implements Iterable { return ids.hasNext(); } @Override - public final URIMetadata next() { + public final URIMetadataNode next() { byte[] id = ids.next(); if (id == null) return null; return getMetadata(id); @@ -551,7 +515,7 @@ public final class Fulltext implements Iterable { this.count++; } } else { - final Iterator i = entries(); // iterates indexURLEntry objects + final Iterator i = entries(); // iterates indexURLEntry objects URIMetadata entry; String url; while (i.hasNext()) { @@ -650,7 +614,7 @@ public final class Fulltext implements Iterable { // collect hashes from all domains // fetch urls from the database to determine the host in clear text - URIMetadata urlref; + URIMetadataNode urlref; if (count < 0 || count > domainSamples.size()) count = domainSamples.size(); this.statsDump = new ArrayList(); final TreeSet set = new TreeSet(); @@ -687,7 +651,7 @@ public final class Fulltext implements Iterable { */ public Map domainHashResolver(final Map domainSamples) { final HashMap hostMap = new HashMap(); - URIMetadata urlref; + URIMetadataNode urlref; final ScoreMap hosthashScore = new ConcurrentScoreMap(); for (final Map.Entry e: domainSamples.entrySet()) { @@ -708,7 +672,7 @@ public final class Fulltext implements Iterable { // fetch urls from the database to determine the 
host in clear text final Iterator j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first) - URIMetadata urlref; + URIMetadataNode urlref; String urlhash; count += 10; // make some more to prevent that we have to do this again after deletions too soon. if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size(); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 8f67db66d..329f328c2 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -57,6 +57,7 @@ import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReferenceFactory; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -506,7 +507,7 @@ public class Segment { if (urlhash == null) return 0; // determine the url string - final URIMetadata entry = fulltext().getMetadata(urlhash); + final URIMetadataNode entry = fulltext().getMetadata(urlhash); if (entry == null) return 0; if (entry.url() == null) return 0; diff --git a/source/net/yacy/search/query/RWIProcess.java b/source/net/yacy/search/query/RWIProcess.java index fa4b34fb7..23c954031 100644 --- a/source/net/yacy/search/query/RWIProcess.java +++ b/source/net/yacy/search/query/RWIProcess.java @@ -59,6 +59,7 @@ import net.yacy.document.LibraryProvider; import net.yacy.interaction.contentcontrol.ContentControlFilterUpdateThread; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -635,7 +636,7 
@@ public final class RWIProcess extends Thread if ( obrwi == null ) { return null; // all time was already wasted in takeRWI to get another element } - final URIMetadata page = this.query.getSegment().fulltext().getMetadata(obrwi.getElement(), obrwi.getWeight()); + final URIMetadataNode page = this.query.getSegment().fulltext().getMetadata(obrwi.getElement(), obrwi.getWeight()); if ( page == null ) { try { this.misses.putUnique(obrwi.getElement().urlhash()); @@ -896,7 +897,7 @@ public final class RWIProcess extends Thread } final Iterator domhashs = this.hostNavigator.keys(false); - URIMetadata row; + URIMetadataNode row; byte[] urlhash; String hosthash, hostname; if ( this.hostResolver != null ) {