diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index e4b84a1cf..203870d24 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -46,7 +46,7 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.logging.Log; import net.yacy.peers.NewsPool; import net.yacy.search.Switchboard; @@ -194,7 +194,7 @@ public class Bookmarks { final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash); if (bookmark == null) { // try to get the bookmark from the LURL database - final URIMetadataRow urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash)); + final URIMetadata urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash)); if (urlentry != null) try { final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null)); prop.put("mode_edit", "0"); // create mode diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index ee0582325..0dae87a22 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -35,7 +35,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.logging.Log; import net.yacy.peers.Seed; import net.yacy.search.Switchboard; @@ -178,7 +178,7 @@ public class CrawlResults { boolean dark = true; String urlstr, urltxt; Seed initiatorSeed, executorSeed; - URIMetadataRow urle; + URIMetadata urle; int cnt = 0; final Iterator> i = ResultURLs.results(tabletype); diff --git a/htroot/IndexControlRWIs_p.html b/htroot/IndexControlRWIs_p.html index 448b3d2ee..430782a6b 100644 --- a/htroot/IndexControlRWIs_p.html +++ b/htroot/IndexControlRWIs_p.html @@ -33,10 +33,10 @@
Index Deletion

- #(solr)#::
#(/solr)# + />
+ #(solr)#::
#(/solr)#


diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 6183a0c77..0ea62aaa7 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -42,6 +42,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -156,7 +157,7 @@ public class IndexControlRWIs_p { if ( post.get("deleteIndex", "").equals("on") ) { segment.clear(); } - if ( post.get("deleteSolr", "").equals("on") && sb.index.getRemoteSolr() != null) { + if ( post.get("deleteRemoteSolr", "").equals("on") && sb.index.getRemoteSolr() != null) { try { sb.index.getRemoteSolr().clear(); } catch ( final Exception e ) { @@ -307,15 +308,15 @@ public class IndexControlRWIs_p { index = segment.termIndex().get(keyhash, null); // built urlCache final Iterator urlIter = index.entries(); - final TreeMap knownURLs = - new TreeMap(Base64Order.enhancedCoder); + final TreeMap knownURLs = + new TreeMap(Base64Order.enhancedCoder); final HandleSet unknownURLEntries = new HandleSet( WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, index.size()); Reference iEntry; - URIMetadataRow lurl; + URIMetadata lurl; while (urlIter.hasNext()) { iEntry = urlIter.next(); lurl = segment.urlMetadata().load(iEntry.urlhash()); @@ -413,7 +414,7 @@ public class IndexControlRWIs_p { } catch ( final RowSpaceExceededException e ) { Log.logException(e); } - final URIMetadataRow e = segment.urlMetadata().load(b); + final URIMetadata e = segment.urlMetadata().load(b); segment.urlMetadata().remove(b); if ( e != null ) { url = e.url(); @@ -448,7 +449,7 @@ public class IndexControlRWIs_p { } catch ( final RowSpaceExceededException e ) { Log.logException(e); } - final URIMetadataRow e = segment.urlMetadata().load(b); + final URIMetadata e = segment.urlMetadata().load(b); segment.urlMetadata().remove(b); if ( e != null ) { url = e.url(); @@ -514,7 +515,7 @@ public class IndexControlRWIs_p { prop.put("genUrlList_lines", maxlines); int i = 0; DigestURI url; - URIMetadataRow entry; + URIMetadata entry; String us; long rn = -1; while ( !ranked.isEmpty() && (entry = ranked.takeURL(false, 1000)) != null ) { diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 4527ca016..820c8b4f6 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -35,7 +35,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; @@ -132,7 +132,7 @@ public class IndexControlURLs_p { } if (post.containsKey("urlhashdelete")) { - final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash)); + final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash)); if (entry == null) { prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } else { @@ -166,7 +166,7 @@ public class IndexControlURLs_p { final DigestURI url = new DigestURI(urlstring); urlhash = ASCII.String(url.hash()); prop.put("urlhash", urlhash); - final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash)); + final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash)); if (entry == null) { prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true)); prop.putHTML("urlstring", urlstring); @@ -184,7 +184,7 @@ public class IndexControlURLs_p { } if (post.containsKey("urlhashsearch")) { - final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash)); + final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash)); if (entry == null) { prop.putHTML("result", "No Entry for URL hash " + urlhash); } else { @@ -199,9 +199,9 @@ public class IndexControlURLs_p { // generate list if (post.containsKey("urlhashsimilar")) { try { - final Iterator entryIt = new RotateIterator(segment.urlMetadata().entries(true, urlhash), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax()); + final Iterator entryIt = new RotateIterator(segment.urlMetadata().entries(true, urlhash), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax()); final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:
"); - URIMetadataRow entry; + URIMetadata entry; int i = 0, rows = 0, cols = 0; prop.put("urlhashsimilar", "1"); while (entryIt.hasNext() && i < 256) { @@ -303,14 +303,14 @@ public class IndexControlURLs_p { return prop; } - private static serverObjects genUrlProfile(final Segment segment, final URIMetadataRow entry, final String urlhash) { + private static serverObjects genUrlProfile(final Segment segment, final URIMetadata entry, final String urlhash) { final serverObjects prop = new serverObjects(); if (entry == null) { prop.put("genUrlProfile", "1"); prop.put("genUrlProfile_urlhash", urlhash); return prop; } - final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash()); + final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash()); if (entry.url() == null) { prop.put("genUrlProfile", "1"); prop.put("genUrlProfile_urlhash", urlhash); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 1e6ce3271..712aecb5f 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -51,7 +51,7 @@ import net.yacy.document.WordTokenizer; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; @@ -112,7 +112,7 @@ public class ViewFile { // get the url hash from which the content should be loaded String urlHash = post.get("urlHash", ""); - URIMetadataRow urlEntry = null; + URIMetadata urlEntry = null; // get the urlEntry that belongs to the url hash if (urlHash.length() > 0 && (urlEntry = indexSegment.urlMetadata().load(ASCII.getBytes(urlHash))) != null) { // get the url that belongs to the entry diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index 531169bc8..8f10f903a 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -35,7 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.LibraryProvider; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; @@ -86,12 +86,12 @@ public class Vocabulary_p { if (p >= 0) t = t.substring(p + 1); } if (discoverFromTitle || discoverFromTitleSplitted) { - URIMetadataRow m = segment.urlMetadata().load(u.hash()); + URIMetadata m = segment.urlMetadata().load(u.hash()); if (m != null) t = m.dc_title(); if (t.endsWith(".jpg") || t.endsWith(".gif")) continue; } if (discoverFromAuthor) { - URIMetadataRow m = segment.urlMetadata().load(u.hash()); + URIMetadata m = segment.urlMetadata().load(u.hash()); if (m != null) t = m.dc_creator(); } t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); diff --git a/htroot/api/yacydoc.java b/htroot/api/yacydoc.java index 4a0ce0a30..1e26c55b3 100644 --- a/htroot/api/yacydoc.java +++ b/htroot/api/yacydoc.java @@ -35,7 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader.FileType; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; @@ -97,13 +97,13 @@ public class yacydoc { } if (urlhash == null || urlhash.isEmpty()) return prop; - final URIMetadataRow entry = segment.urlMetadata().load(urlhash.getBytes()); + final URIMetadata entry = segment.urlMetadata().load(urlhash.getBytes()); if (entry == null) return prop; if (entry.url() == null) { return prop; } - final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash()); + final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash()); prop.putXML("dc_title", entry.dc_title()); prop.putXML("dc_creator", entry.dc_creator()); diff --git a/htroot/yacy/urls.java b/htroot/yacy/urls.java index 0829b7f81..58e8acc93 100644 --- a/htroot/yacy/urls.java +++ b/htroot/yacy/urls.java @@ -31,7 +31,7 @@ import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.peers.Protocol; import net.yacy.search.Switchboard; import de.anomic.crawler.NoticedURL; @@ -110,7 +110,7 @@ public class urls { if (urlhashes.length() % 12 != 0) return prop; final int count = urlhashes.length() / 12; int c = 0; - URIMetadataRow entry; + URIMetadata entry; DigestURI referrer; for (int i = 0; i < count; i++) { entry = sb.index.urlMetadata().load(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1)))); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 5b67b9c01..81d61d423 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -57,7 +57,7 @@ import net.yacy.document.LibraryProvider; import net.yacy.document.Parser; import net.yacy.document.geolocation.GeoLocation; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.logging.Log; @@ -660,7 +660,7 @@ public class yacysearch { return prop; } final String recommendHash = post.get("recommendref", ""); // urlhash - final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash)); + final URIMetadata urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash)); if ( urlentry != null ) { Document[] documents = null; try { @@ -696,7 +696,7 @@ public class yacysearch { return prop; } final String bookmarkHash = post.get("bookmarkref", ""); // urlhash - final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(bookmarkHash)); + final URIMetadata urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(bookmarkHash)); if ( urlentry != null ) { try { sb.tables.bookmarks.createBookmark( diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index d28578f79..596b072d2 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -45,7 +45,7 @@ import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.workflow.WorkflowProcessor; @@ -439,7 +439,7 @@ public final class CrawlStacker { // check if the url is double registered final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists - final URIMetadataRow oldEntry = this.indexSegment.urlMetadata().load(url.hash()); + final URIMetadata oldEntry = this.indexSegment.urlMetadata().load(url.hash()); if (oldEntry == null) { if (dbocc != null) { // do double-check diff --git a/source/de/anomic/crawler/SitemapImporter.java b/source/de/anomic/crawler/SitemapImporter.java index ad70dbd3d..22189886d 100644 --- a/source/de/anomic/crawler/SitemapImporter.java +++ b/source/de/anomic/crawler/SitemapImporter.java @@ -32,7 +32,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.document.parser.sitemapParser; import net.yacy.document.parser.sitemapParser.URLEntry; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import de.anomic.crawler.retrieval.Request; @@ -84,7 +84,7 @@ public class SitemapImporter extends Thread { final String dbocc = this.sb.urlExists(nexturlhash); if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) { // the url was already loaded. we need to check the date - final URIMetadataRow oldEntry = this.sb.index.urlMetadata().load(nexturlhash); + final URIMetadata oldEntry = this.sb.index.urlMetadata().load(nexturlhash); if (oldEntry != null) { final Date modDate = oldEntry.moddate(); // check if modDate is null diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java index e90864e4d..c2693ca52 100644 --- a/source/de/anomic/data/URLAnalysis.java +++ b/source/de/anomic/data/URLAnalysis.java @@ -419,7 +419,8 @@ public class URLAnalysis { public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException { System.out.println("INDEX DIFF URL-COL startup"); final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile)); - final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false); + final MetadataRepository mr = new MetadataRepository(new File(metadataPath)); + mr.connectUrlDb(Segment.UrlDbName, false, false); final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000); System.out.println("INDEX DIFF URL-COL loaded dump, starting diff"); final long start = System.currentTimeMillis(); @@ -447,7 +448,8 @@ public class URLAnalysis { public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException { // format: 0=text, 1=html, 2=rss/xml System.out.println("URL EXPORT startup"); - final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false); + final MetadataRepository mr = new MetadataRepository(new File(metadataPath)); + mr.connectUrlDb(Segment.UrlDbName, false, false); final HandleSet hs = (diffFile == null) ? null : new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile)); System.out.println("URL EXPORT loaded dump, starting export"); final Export e = mr.export(new File(export), ".*", hs, format, false); @@ -461,7 +463,8 @@ public class URLAnalysis { public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException { System.out.println("URL DELETE startup"); - final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false); + final MetadataRepository mr = new MetadataRepository(new File(metadataPath)); + mr.connectUrlDb(Segment.UrlDbName, false, false); final int mrSize = mr.size(); final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile)); System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize); diff --git a/source/de/anomic/data/ymark/YMarkMetadata.java b/source/de/anomic/data/ymark/YMarkMetadata.java index f4236c7ac..20d1719aa 100644 --- a/source/de/anomic/data/ymark/YMarkMetadata.java +++ b/source/de/anomic/data/ymark/YMarkMetadata.java @@ -36,7 +36,7 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Document; import net.yacy.document.Parser.Failure; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.index.Segment; import de.anomic.crawler.retrieval.Response; @@ -105,7 +105,7 @@ public class YMarkMetadata { public EnumMap getMetadata() { final EnumMap metadata = new EnumMap(METADATA.class); - final URIMetadataRow urlEntry = this.indexSegment.urlMetadata().load(this.uri.hash()); + final URIMetadata urlEntry = this.indexSegment.urlMetadata().load(this.uri.hash()); if (urlEntry != null) { metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size())); metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate())); diff --git a/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java b/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java index 981eda488..f0b040258 100644 --- a/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java @@ -34,6 +34,7 @@ import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest; import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; @@ -89,7 +90,7 @@ public class AbstractSolrConnector implements SolrConnector { @Override public long getSize() { try { - final SolrDocumentList list = get("*:*", 0, 1); + final SolrDocumentList list = query("*:*", 0, 1); return list.getNumFound(); } catch (final Throwable e) { Log.logException(e); @@ -132,8 +133,8 @@ public class AbstractSolrConnector implements SolrConnector { @Override public boolean exists(final String id) throws IOException { try { - final SolrDocumentList list = get(SolrField.id.getSolrFieldName() + ":" + id, 0, 1); - return list.getNumFound() > 0; + final SolrDocument doc = get(id); + return doc != null; } catch (final Throwable e) { Log.logException(e); return false; @@ -186,7 +187,7 @@ public class AbstractSolrConnector implements SolrConnector { * @throws IOException */ @Override - public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException { + public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException { // construct query final SolrQuery query = new SolrQuery(); query.setQuery(querystring); @@ -209,8 +210,33 @@ public class AbstractSolrConnector implements SolrConnector { } catch (final Throwable e) { throw new IOException(e); } + } + + /** + * get a document from solr by given id + * @param id + * @return one result or null if no result exists + * @throws IOException + */ + @Override + public SolrDocument get(final String id) throws IOException { + // construct query + StringBuffer sb = new StringBuffer(id.length() + 3); + sb.append(SolrField.id.getSolrFieldName()).append(':').append(id); + final SolrQuery query = new SolrQuery(); + query.setQuery(sb.toString()); + query.setRows(1); + query.setStart(0); - //return result; + // query the server + try { + final QueryResponse rsp = this.server.query( query ); + final SolrDocumentList docs = rsp.getResults(); + if (docs.isEmpty()) return null; + return docs.get(0); + } catch (final Throwable e) { + throw new IOException(e); + } } } diff --git a/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java b/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java index 3660ae43d..766500e7c 100644 --- a/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java @@ -5,6 +5,7 @@ import java.util.Collection; import java.util.List; import java.util.concurrent.ArrayBlockingQueue; +import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; @@ -111,6 +112,11 @@ public class MultipleSolrConnector implements SolrConnector { return this.solr.exists(id); } + @Override + public SolrDocument get(String id) throws IOException { + return this.solr.get(id); + } + @Override public void add(final SolrDoc solrdoc) throws IOException, SolrException { try { @@ -132,8 +138,8 @@ public class MultipleSolrConnector implements SolrConnector { } @Override - public SolrDocumentList get(String querystring, int offset, int count) throws IOException { - return this.solr.get(querystring, offset, count); + public SolrDocumentList query(String querystring, int offset, int count) throws IOException { + return this.solr.query(querystring, offset, count); } @Override diff --git a/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java b/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java index f3863732a..f9acb44d8 100644 --- a/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java @@ -28,6 +28,7 @@ import java.io.IOException; import java.util.Collection; import java.util.List; +import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; @@ -120,6 +121,21 @@ public class RetrySolrConnector implements SolrConnector { return false; } + @Override + public SolrDocument get(String id) throws IOException { + final long t = System.currentTimeMillis() + this.retryMaxTime; + Throwable ee = null; + while (System.currentTimeMillis() < t) try { + return this.solrConnector.get(id); + } catch (final Throwable e) { + ee = e; + try {Thread.sleep(10);} catch (final InterruptedException e1) {} + continue; + } + if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage()); + return null; + } + @Override public void add(final SolrDoc solrdoc) throws IOException, SolrException { final long t = System.currentTimeMillis() + this.retryMaxTime; @@ -141,11 +157,11 @@ public class RetrySolrConnector implements SolrConnector { } @Override - public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException { + public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException { final long t = System.currentTimeMillis() + this.retryMaxTime; Throwable ee = null; while (System.currentTimeMillis() < t) try { - return this.solrConnector.get(querystring, offset, count); + return this.solrConnector.query(querystring, offset, count); } catch (final Throwable e) { ee = e; try {Thread.sleep(10);} catch (final InterruptedException e1) {} diff --git a/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java b/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java index 45edd8d04..e6675490b 100644 --- a/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java @@ -116,6 +116,15 @@ public class ShardSolrConnector implements SolrConnector { } return false; } + + @Override + public SolrDocument get(String id) throws IOException { + for (final SolrConnector connector: this.connectors) { + SolrDocument doc = connector.get(id); + if (doc != null) return doc; + } + return null; + } /** * add a Solr document @@ -148,10 +157,10 @@ public class ShardSolrConnector implements SolrConnector { * @throws IOException */ @Override - public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException { + public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException { final SolrDocumentList list = new SolrDocumentList(); for (final SolrConnector connector: this.connectors) { - final SolrDocumentList l = connector.get(querystring, offset, count); + final SolrDocumentList l = connector.query(querystring, offset, count); for (final SolrDocument d: l) { list.add(d); } @@ -163,7 +172,7 @@ public class ShardSolrConnector implements SolrConnector { final SolrDocumentList[] list = new SolrDocumentList[this.connectors.size()]; int i = 0; for (final SolrConnector connector: this.connectors) { - list[i++] = connector.get(querystring, offset, count); + list[i++] = connector.query(querystring, offset, count); } return list; } diff --git a/source/net/yacy/cora/services/federated/solr/SolrConnector.java b/source/net/yacy/cora/services/federated/solr/SolrConnector.java index a05e38d35..3d258d03e 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrConnector.java @@ -28,6 +28,7 @@ import java.io.IOException; import java.util.Collection; import java.util.List; +import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; @@ -87,13 +88,21 @@ public interface SolrConnector { public void add(final SolrDoc solrdoc) throws IOException, SolrException; public void add(final Collection solrdocs) throws IOException, SolrException; + /** + * get a document from solr by given id + * @param id + * @return one result or null if no result exists + * @throws IOException + */ + public SolrDocument get(final String id) throws IOException; + /** * get a query result from solr * to get all results set the query String to "*:*" * @param querystring * @throws IOException */ - public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException; + public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException; /** * get the size of the index diff --git a/source/net/yacy/kelondro/data/meta/URIMetadata.java b/source/net/yacy/kelondro/data/meta/URIMetadata.java index 7b771d1db..49dc6a0e7 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadata.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadata.java @@ -24,8 +24,8 @@ package net.yacy.kelondro.data.meta; import java.util.Date; +import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.order.Bitfield; -import net.yacy.kelondro.rwi.Reference; public interface URIMetadata extends URIReference { @@ -74,10 +74,12 @@ public interface URIMetadata extends URIReference { public String snippet(); - public Reference word(); + public WordReference word(); public boolean isOlder(final URIMetadata other); public String toString(final String snippet); + public byte[] referrerHash(); + } diff --git a/source/net/yacy/kelondro/data/meta/URIReference.java b/source/net/yacy/kelondro/data/meta/URIReference.java index d2ba476a3..0616f2689 100644 --- a/source/net/yacy/kelondro/data/meta/URIReference.java +++ b/source/net/yacy/kelondro/data/meta/URIReference.java @@ -35,6 +35,12 @@ public interface URIReference { */ public byte[] hash(); + /** + * the second half of a uri hash is the host hash + * @return + */ + public String hosthash(); + /** * The modification date of the URIReference is given if * the record was created first and is defined with the diff --git a/source/net/yacy/kelondro/data/meta/URIReferenceNode.java b/source/net/yacy/kelondro/data/meta/URIReferenceNode.java index bacdfe998..eab4cfdf4 100644 --- a/source/net/yacy/kelondro/data/meta/URIReferenceNode.java +++ b/source/net/yacy/kelondro/data/meta/URIReferenceNode.java @@ -49,6 +49,14 @@ public class URIReferenceNode extends HashMap implements URIRefe return this.hash; } + private String hostHash = null; + @Override + public String hosthash() { + if (this.hostHash != null) return this.hostHash; + this.hostHash = ASCII.String(this.hash, 6, 6); + return this.hostHash; + } + @Override public Date moddate() { byte[] x = this.get(MetadataVocabulary.moddate.name()); diff --git a/source/net/yacy/peers/PeerActions.java b/source/net/yacy/peers/PeerActions.java index db4096c69..609d0fd79 100644 --- a/source/net/yacy/peers/PeerActions.java +++ b/source/net/yacy/peers/PeerActions.java @@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.RSSMessage; -import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.MapTools; import net.yacy.peers.operation.yacyVersion; diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 552848693..f186d7857 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -77,6 +77,7 @@ import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.services.federated.opensearch.SRURSSConnector; import net.yacy.cora.services.federated.yacy.CacheStrategy; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -1155,7 +1156,7 @@ public final class Protocol public static String transferIndex( final Seed targetSeed, final ReferenceContainerCache indexes, - final SortedMap urlCache, + final SortedMap urlCache, final boolean gzipBody, final int timeout) { @@ -1216,7 +1217,7 @@ public final class Protocol } // all url's known // extract the urlCache from the result - final URIMetadataRow[] urls = new URIMetadataRow[uhs.length]; + final URIMetadata[] urls = new URIMetadataRow[uhs.length]; for ( int i = 0; i < uhs.length; i++ ) { urls[i] = urlCache.get(ASCII.getBytes(uhs[i])); if ( urls[i] == null ) { @@ -1324,7 +1325,7 @@ public final class Protocol private static Map transferURL( final Seed targetSeed, - final URIMetadataRow[] urls, + final URIMetadata[] urls, boolean gzipBody, final int timeout) { // this post a message to the remote message board @@ -1346,7 +1347,7 @@ public final class Protocol String resource; int urlc = 0; int urlPayloadSize = 0; - for ( final URIMetadataRow url : urls ) { + for ( final URIMetadata url : urls ) { if ( url != null ) { resource = url.toString(); //System.out.println("*** DEBUG resource = " + resource); diff --git a/source/net/yacy/peers/dht/Transmission.java b/source/net/yacy/peers/dht/Transmission.java index e67d1bbd1..5060b3240 100644 --- a/source/net/yacy/peers/dht/Transmission.java +++ b/source/net/yacy/peers/dht/Transmission.java @@ -32,7 +32,7 @@ import java.util.SortedMap; import java.util.TreeMap; import net.yacy.cora.document.ASCII; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.data.word.WordReferenceRow; @@ -90,7 +90,7 @@ public class Transmission { */ private final byte[] primaryTarget; private final ReferenceContainerCache containers; - private final SortedMap references; + private final SortedMap references; private final HandleSet badReferences; private final List targets; private int hit, miss; @@ -106,7 +106,7 @@ public class Transmission { super(); this.primaryTarget = primaryTarget; this.containers = new ReferenceContainerCache(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength); - this.references = new TreeMap(Base64Order.enhancedCoder); + this.references = new TreeMap(Base64Order.enhancedCoder); this.badReferences = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); this.targets = targets; this.hit = 0; @@ -175,7 +175,7 @@ public class Transmission { notFoundx.add(e.urlhash()); continue; } - final URIMetadataRow r = Transmission.this.segment.urlMetadata().load(e.urlhash()); + final URIMetadata r = Transmission.this.segment.urlMetadata().load(e.urlhash()); if (r == null) { notFoundx.add(e.urlhash()); this.badReferences.put(e.urlhash()); diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index c17724fbb..cfc583048 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -45,6 +45,7 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; @@ -332,7 +333,7 @@ public class Blacklist { * @param entry Entry to be checked * @return Whether the given entry is blacklisted */ - public boolean isListed(final BlacklistType blacklistType, final URIMetadataRow entry) { + public boolean isListed(final BlacklistType blacklistType, final URIMetadata entry) { // Call inner method return isListed(blacklistType, entry.url()); } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 7bcf00a5c..746565fdc 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -111,6 +111,7 @@ import net.yacy.document.parser.html.Evaluation; import net.yacy.gui.Tray; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.HandleSet; @@ -391,8 +392,12 @@ public final class Switchboard extends serverSwitch fileSizeMax, this.useTailCache, this.exceed134217727, - solrLocal); - + solrLocal, + true, // useCitationIndex + true, // useRWI + true // useMetadata + ); + // prepare a solr index profile switch list final File solrBackupProfile = new File("defaults/solr.keys.list"); final String schemename = @@ -1197,7 +1202,11 @@ public final class Switchboard extends serverSwitch fileSizeMax, this.useTailCache, this.exceed134217727, - solrLocal); + solrLocal, + true, // useCitationIndex + true, // useRWI + true // useMetadata + ); this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object // create a crawler @@ -1447,7 +1456,7 @@ public final class Switchboard extends serverSwitch if ( urlhash.length == 0 ) { return null; } - final URIMetadataRow le = this.index.urlMetadata().load(urlhash); + final URIMetadata le = this.index.urlMetadata().load(urlhash); if ( le != null ) { return le.url(); } diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index 2be41ccb9..14224f0f5 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -41,6 +41,7 @@ import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.search.query.QueryParams; @@ -74,7 +75,19 @@ public class DocumentIndex extends Segment public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize) throws IOException { - super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false, true); + super( + new Log("DocumentIndex"), + segmentPath, + cachesize, + targetFileSize * 4 - 1, + false, // useTailCache + false, // exceed134217727 + true, // connectLocalSolr + true, // useCitationIndex + true, // useRWI + true // useMetadata + ); + final int cores = Runtime.getRuntime().availableProcessors() + 1; this.callback = callback; this.queue = new LinkedBlockingQueue(cores * 300); @@ -227,7 +240,7 @@ public class DocumentIndex extends Segment rankedCache.start(); // search is running; retrieve results - URIMetadataRow row; + URIMetadata row; final ArrayList files = new ArrayList(); while ( (row = rankedCache.takeURL(false, 1000)) != null ) { files.add(row.url()); diff --git a/source/net/yacy/search/index/MetadataRepository.java b/source/net/yacy/search/index/MetadataRepository.java index f9faa57ad..1c1acf672 100644 --- a/source/net/yacy/search/index/MetadataRepository.java +++ b/source/net/yacy/search/index/MetadataRepository.java @@ -49,6 +49,7 @@ import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.index.Cache; @@ -65,43 +66,38 @@ import net.yacy.search.Switchboard; import net.yacy.search.solr.EmbeddedSolrConnector; import org.apache.lucene.util.Version; - import de.anomic.crawler.CrawlStacker; public final class MetadataRepository implements /*Metadata,*/ Iterable { // class objects - protected Index urlIndexFile; + private final File location; + private Index urlIndexFile; private Export exportthread; // will have a export thread assigned if exporter is running - private final File location; - private final String tablename; + private String tablename; private ArrayList statsDump; private SolrConnector localSolr, remoteSolr; - public MetadataRepository( - final File path, - final String tablename, - final boolean useTailCache, - final boolean exceed134217727) { + public MetadataRepository(final File path) { this.location = path; - this.tablename = tablename; - Index backupIndex = null; - backupIndex = new SplitTable(this.location, tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727); - this.urlIndexFile = backupIndex; //new Cache(backupIndex, 20000000, 20000000); + this.tablename = null; + this.urlIndexFile = null; this.exportthread = null; // will have a export thread assigned if exporter is running this.statsDump = null; this.remoteSolr = null; this.localSolr = null; } - - public void connectRemoteSolr(final SolrConnector solr) { - this.remoteSolr = solr; + + public void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) { + if (this.urlIndexFile != null) return; + this.tablename = tablename; + this.urlIndexFile = new SplitTable(this.location, tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727); } - public void disconnectRemoteSolr() { - if (this.remoteSolr == null) return; - this.remoteSolr.close(); - this.remoteSolr = null; + public void disconnectUrlDb() { + if (this.urlIndexFile == null) return; + this.urlIndexFile.close(); + this.urlIndexFile = null; } public void connectLocalSolr() throws IOException { @@ -123,6 +119,16 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable this.localSolr.close(); this.localSolr = null; } + + public void connectRemoteSolr(final SolrConnector solr) { + this.remoteSolr = solr; + } + + public void disconnectRemoteSolr() { + if (this.remoteSolr == null) return; + this.remoteSolr.close(); + this.remoteSolr = null; + } public SolrConnector getLocalSolr() { return this.localSolr; @@ -133,7 +139,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable } public void clearCache() { - if (this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache(); + if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache(); if (this.statsDump != null) this.statsDump.clear(); this.statsDump = null; } @@ -142,15 +148,22 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable if (this.exportthread != null) this.exportthread.interrupt(); if (this.urlIndexFile == null) { SplitTable.delete(this.location, this.tablename); - this.urlIndexFile = new SplitTable(this.location, this.tablename, URIMetadataRow.rowdef, false, false); } else { this.urlIndexFile.clear(); } + if (this.localSolr != null) { + this.localSolr.clear(); + } + // the remote solr is not cleared here because that shall be done separately this.statsDump = null; } public int size() { - return this.urlIndexFile == null ? 0 : this.urlIndexFile.size(); + int size = 0; + size += this.urlIndexFile == null ? 0 : this.urlIndexFile.size(); + size += this.localSolr == null ? 0 : this.localSolr.getSize(); + size += this.remoteSolr == null ? 0 : this.remoteSolr.getSize(); + return size; } public void close() { @@ -170,8 +183,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable } public int writeCacheSize() { - if (this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize(); - if (this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize(); + if (this.urlIndexFile != null && this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize(); + if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize(); return 0; } @@ -181,59 +194,69 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable * @param obrwi * @return */ - public URIMetadataRow load(final WeakPriorityBlockingQueue.Element obrwi) { - if (this.urlIndexFile == null) return null; + public URIMetadata load(final WeakPriorityBlockingQueue.Element obrwi) { if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element final byte[] urlHash = obrwi.getElement().urlhash(); if (urlHash == null) return null; - try { + if (this.urlIndexFile != null) try { final Row.Entry entry = this.urlIndexFile.get(urlHash, false); if (entry == null) return null; return new URIMetadataRow(entry, obrwi.getElement(), obrwi.getWeight()); } catch (final IOException e) { - return null; + Log.logException(e); + } + /* + if (this.localSolr != null) { + try { + SolrDocument doc = this.localSolr.get(ASCII.String(urlHash)); + } catch (IOException e) { + Log.logException(e); + } } + */ + return null; } - public URIMetadataRow load(final byte[] urlHash) { - if (this.urlIndexFile == null) return null; + public URIMetadata load(final byte[] urlHash) { if (urlHash == null) return null; - try { + if (this.urlIndexFile != null) try { final Row.Entry entry = this.urlIndexFile.get(urlHash, false); if (entry == null) return null; return new URIMetadataRow(entry, null, 0); } catch (final IOException e) { return null; } + return null; } - public void store(final URIMetadataRow entry) throws IOException { + public void store(final URIMetadata entry) throws IOException { // Check if there is a more recent Entry already in the DB - URIMetadataRow oldEntry; - if (this.urlIndexFile == null) return; // case may happen during shutdown or startup - try { - final Row.Entry oe = this.urlIndexFile.get(entry.hash(), false); - oldEntry = (oe == null) ? null : new URIMetadataRow(oe, null, 0); - } catch (final Exception e) { - Log.logException(e); - oldEntry = null; + if (this.urlIndexFile != null && entry instanceof URIMetadataRow) { + URIMetadata oldEntry = null; + try { + final Row.Entry oe = this.urlIndexFile.get(entry.hash(), false); + oldEntry = (oe == null) ? null : new URIMetadataRow(oe, null, 0); + } catch (final Exception e) { + Log.logException(e); + oldEntry = null; + } + if (oldEntry != null && entry.isOlder(oldEntry)) { + // the fetched oldEntry is better, so return its properties instead of the new ones + // this.urlHash = oldEntry.urlHash; // unnecessary, should be the same + // this.url = oldEntry.url; // unnecessary, should be the same + // doesn't make sense, since no return value: + //entry = oldEntry; + return; // this did not need to be stored, but is updated + } + + try { + this.urlIndexFile.put(((URIMetadataRow) entry).toRowEntry()); + } catch (final RowSpaceExceededException e) { + throw new IOException("RowSpaceExceededException in " + this.urlIndexFile.filename() + ": " + e.getMessage()); + } + this.statsDump = null; + if (MemoryControl.shortStatus()) clearCache(); } - if (oldEntry != null && entry.isOlder(oldEntry)) { - // the fetched oldEntry is better, so return its properties instead of the new ones - // this.urlHash = oldEntry.urlHash; // unnecessary, should be the same - // this.url = oldEntry.url; // unnecessary, should be the same - // doesn't make sense, since no return value: - //entry = oldEntry; - return; // this did not need to be stored, but is updated - } - - try { - this.urlIndexFile.put(entry.toRowEntry()); - } catch (final RowSpaceExceededException e) { - throw new IOException("RowSpaceExceededException in " + this.urlIndexFile.filename() + ": " + e.getMessage()); - } - this.statsDump = null; - if (MemoryControl.shortStatus()) clearCache() ; } public boolean remove(final byte[] urlHash) { @@ -251,13 +274,14 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable Log.logException(e); } } - try { + if (this.urlIndexFile != null) try { final Row.Entry r = this.urlIndexFile.remove(urlHash); if (r != null) this.statsDump = null; return r != null; } catch (final IOException e) { return false; } + return false; } public boolean exists(final byte[] urlHash) { @@ -297,17 +321,17 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable return keys(true, null); } - public CloneableIterator entries() throws IOException { + public CloneableIterator entries() throws IOException { // enumerates entry elements return new kiter(); } - public CloneableIterator entries(final boolean up, final String firstHash) throws IOException { + public CloneableIterator entries(final boolean up, final String firstHash) throws IOException { // enumerates entry elements return new kiter(up, firstHash); } - public class kiter implements CloneableIterator { + public class kiter implements CloneableIterator { // enumerates entry elements private final CloneableIterator iter; private final boolean error; @@ -342,7 +366,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable } @Override - public final URIMetadataRow next() { + public final URIMetadata next() { Row.Entry e = null; if (this.iter == null) { return null; } if (this.iter.hasNext()) { e = this.iter.next(); } @@ -372,7 +396,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable final Log log = new Log("URLDBCLEANUP"); final HashSet damagedURLS = new HashSet(); try { - final Iterator eiter = entries(true, null); + final Iterator eiter = entries(true, null); int iteratorCount = 0; while (eiter.hasNext()) try { eiter.next(); @@ -456,7 +480,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable public void run() { try { Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet"); - final Iterator eiter = entries(true, null); + final Iterator eiter = entries(true, null); while (eiter.hasNext() && this.run) { synchronized (this) { if (this.pause) { @@ -469,7 +493,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable } } } - final URIMetadataRow entry = eiter.next(); + final URIMetadata entry = eiter.next(); if (entry == null) { if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null"); } else if (entry.hash() == null) { @@ -605,8 +629,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable this.count++; } } else { - final Iterator i = entries(); // iterates indexURLEntry objects - URIMetadataRow entry; + final Iterator i = entries(); // iterates indexURLEntry objects + URIMetadata entry; String url; while (i.hasNext()) { entry = i.next(); @@ -704,7 +728,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable // collect hashes from all domains // fetch urls from the database to determine the host in clear text - URIMetadataRow urlref; + URIMetadata urlref; if (count < 0 || count > domainSamples.size()) count = domainSamples.size(); this.statsDump = new ArrayList(); final TreeSet set = new TreeSet(); @@ -741,7 +765,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable */ public Map domainHashResolver(final Map domainSamples) { final HashMap hostMap = new HashMap(); - URIMetadataRow urlref; + URIMetadata urlref; final ScoreMap hosthashScore = new ConcurrentScoreMap(); for (final Map.Entry e: domainSamples.entrySet()) { @@ -762,7 +786,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable // fetch urls from the database to determine the host in clear text final Iterator j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first) - URIMetadataRow urlref; + URIMetadata urlref; String urlhash; count += 10; // make some more to prevent that we have to do this again after deletions too soon. if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size(); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 437f18d33..44a6800ff 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -47,6 +47,7 @@ import net.yacy.document.Parser; import net.yacy.kelondro.data.citation.CitationReference; import net.yacy.kelondro.data.citation.CitationReferenceFactory; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -88,7 +89,8 @@ public class Segment { public static final int lowcachedivisor = 900; public static final long targetFileSize = 64 * 1024 * 1024; // 256 MB public static final int writeBufferSize = 4 * 1024 * 1024; - + public static final String UrlDbName = "text.urlmd"; + // the reference factory public static final ReferenceFactory wordReferenceFactory = new WordReferenceFactory(); public static final ReferenceFactory citationReferenceFactory = new CitationReferenceFactory(); @@ -109,14 +111,17 @@ public class Segment { final long maxFileSize, final boolean useTailCache, final boolean exceed134217727, - final boolean connectLocalSolr) throws IOException { + final boolean connectLocalSolr, + final boolean useCitationIndex, + final boolean useRWI, + final boolean useMetadata) throws IOException { log.logInfo("Initializing Segment '" + segmentPath + "."); this.log = log; this.segmentPath = segmentPath; - this.termIndex = new IndexCell( + this.termIndex = useRWI ? new IndexCell( segmentPath, "text.index", wordReferenceFactory, @@ -125,9 +130,9 @@ public class Segment { entityCacheMaxSize, targetFileSize, maxFileSize, - writeBufferSize); + writeBufferSize) : null; - this.urlCitationIndex = new IndexCell( + this.urlCitationIndex = useCitationIndex ? new IndexCell( segmentPath, "citation.index", citationReferenceFactory, @@ -136,10 +141,11 @@ public class Segment { entityCacheMaxSize, targetFileSize, maxFileSize, - writeBufferSize); + writeBufferSize) : null; // create LURL-db - this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727); + this.urlMetadata = new MetadataRepository(segmentPath); + if (useMetadata) this.urlMetadata.connectUrlDb(UrlDbName, useTailCache, exceed134217727); if (connectLocalSolr) this.connectLocalSolr(); } @@ -148,10 +154,12 @@ public class Segment { } public long RWICount() { + if (this.termIndex == null) return 0; return this.termIndex.sizesMax(); } public int RWIBufferCount() { + if (this.termIndex == null) return 0; return this.termIndex.getBufferSize(); } @@ -235,7 +243,7 @@ public class Segment { } @Override public DigestURI next() { - URIMetadataRow umr = Segment.this.urlMetadata.load(bi.next()); + URIMetadata umr = Segment.this.urlMetadata.load(bi.next()); return umr.url(); } @Override @@ -260,9 +268,9 @@ public class Segment { public void clear() { try { - this.termIndex.clear(); - this.urlMetadata.clear(); - this.urlCitationIndex.clear(); + if (this.termIndex != null) this.termIndex.clear(); + if (this.urlMetadata != null) this.urlMetadata.clear(); + if (this.urlCitationIndex != null) this.urlCitationIndex.clear(); } catch (final IOException e) { Log.logException(e); } @@ -328,7 +336,7 @@ public class Segment { assert (wprop.flags != null); ientry.setWord(wprop); wordhash = Word.word2hash(word); - try { + if (this.termIndex != null) try { this.termIndex.add(wordhash, ientry); } catch (final Exception e) { Log.logException(e); @@ -354,7 +362,7 @@ public class Segment { // assign the catchall word ientry.setWord(wprop == null ? catchallWord : wprop); // we use one of the word properties as template to get the document characteristics - try { + if (this.termIndex != null) try { this.termIndex.add(catchallHash, ientry); } catch (final Exception e) { Log.logException(e); @@ -385,9 +393,9 @@ public class Segment { } public synchronized void close() { - this.termIndex.close(); - this.urlMetadata.close(); - this.urlCitationIndex.close(); + if (this.termIndex != null) this.termIndex.close(); + if (this.urlMetadata != null) this.urlMetadata.close(); + if (this.urlCitationIndex != null) this.urlCitationIndex.close(); } public URIMetadataRow storeDocument( @@ -541,7 +549,7 @@ public class Segment { if (urlhash == null) return 0; // determine the url string - final URIMetadataRow entry = urlMetadata().load(urlhash); + final URIMetadata entry = urlMetadata().load(urlhash); if (entry == null) return 0; if (entry.url() == null) return 0; @@ -612,7 +620,7 @@ public class Segment { entry = new WordReferenceVars(containerIterator.next()); // System.out.println("Wordhash: "+wordHash+" UrlHash: // "+entry.getUrlHash()); - final URIMetadataRow ue = Segment.this.urlMetadata.load(entry.urlhash()); + final URIMetadata ue = Segment.this.urlMetadata.load(entry.urlhash()); if (ue == null) { urlHashs.put(entry.urlhash()); } else { diff --git a/source/net/yacy/search/query/RWIProcess.java b/source/net/yacy/search/query/RWIProcess.java index b7ec216c6..54ddf747b 100644 --- a/source/net/yacy/search/query/RWIProcess.java +++ b/source/net/yacy/search/query/RWIProcess.java @@ -55,6 +55,7 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement; import net.yacy.document.Condenser; import net.yacy.document.LibraryProvider; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; @@ -616,7 +617,7 @@ public final class RWIProcess extends Thread * @param waitingtime the time this method may take for a result computation * @return a metadata entry for a url */ - public URIMetadataRow takeURL(final boolean skipDoubleDom, final long waitingtime) { + public URIMetadata takeURL(final boolean skipDoubleDom, final long waitingtime) { // returns from the current RWI list the best URL entry and removes this entry from the list final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime); int p = -1; @@ -627,7 +628,7 @@ public final class RWIProcess extends Thread if ( obrwi == null ) { return null; // all time was already wasted in takeRWI to get another element } - final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi); + final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi); if ( page == null ) { try { this.misses.putUnique(obrwi.getElement().urlhash()); @@ -864,7 +865,7 @@ public final class RWIProcess extends Thread } final Iterator domhashs = this.hostNavigator.keys(false); - URIMetadataRow row; + URIMetadata row; byte[] urlhash; String hosthash, hostname; if ( this.hostResolver != null ) { diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java index 80a255115..21058971c 100644 --- a/source/net/yacy/search/query/SnippetProcess.java +++ b/source/net/yacy/search/query/SnippetProcess.java @@ -41,7 +41,7 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element; import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement; import net.yacy.document.Condenser; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; @@ -454,7 +454,7 @@ public class SnippetProcess { public void run() { // start fetching urls and snippets - URIMetadataRow page; + URIMetadata page; ResultEntry resultEntry; //final int fetchAhead = snippetMode == 0 ? 0 : 10; final boolean nav_topics = SnippetProcess.this.query.navigators.equals("all") || SnippetProcess.this.query.navigators.indexOf("topics",0) >= 0; @@ -498,7 +498,7 @@ public class SnippetProcess { String solrContent = null; if (this.solr != null) { SolrDocument sd = null; - final SolrDocumentList sdl = this.solr.get(SolrField.id.getSolrFieldName()+ ":" + ASCII.String(page.hash()), 0, 1); + final SolrDocumentList sdl = this.solr.query(SolrField.id.getSolrFieldName()+ ":" + ASCII.String(page.hash()), 0, 1); if (!sdl.isEmpty()) { sd = sdl.get(0); } @@ -553,7 +553,7 @@ public class SnippetProcess { } } - protected ResultEntry fetchSnippet(final URIMetadataRow page, final String solrText, final CacheStrategy cacheStrategy) { + protected ResultEntry fetchSnippet(final URIMetadata page, final String solrText, final CacheStrategy cacheStrategy) { // Snippet Fetching can has 3 modes: // 0 - do not fetch snippets // 1 - fetch snippets offline only diff --git a/source/net/yacy/search/snippet/ResultEntry.java b/source/net/yacy/search/snippet/ResultEntry.java index f6adde6dc..126f58c41 100644 --- a/source/net/yacy/search/snippet/ResultEntry.java +++ b/source/net/yacy/search/snippet/ResultEntry.java @@ -34,7 +34,7 @@ import java.util.List; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.data.meta.URIMetadata; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.logging.Log; @@ -50,7 +50,7 @@ import net.yacy.search.index.Segment; public class ResultEntry implements Comparable, Comparator { // payload objects - private final URIMetadataRow urlentry; + private final URIMetadata urlentry; private String alternative_urlstring; private String alternative_urlname; private final TextSnippet textSnippet; @@ -60,7 +60,7 @@ public class ResultEntry implements Comparable, Comparator, Comparator> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false); long urlCounter = 0, wordCounter = 0; @@ -689,7 +697,7 @@ public final class yacy { iEntry = wordIdxEntries.next(); final byte[] urlHash = iEntry.urlhash(); if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { - final URIMetadataRow urlEntry = currentUrlDB.load(urlHash); + final URIMetadata urlEntry = currentUrlDB.load(urlHash); urlCounter++; minimizedUrlDB.store(urlEntry); if (urlCounter % 500 == 0) { @@ -829,7 +837,8 @@ public final class yacy { final File root = dataHome; final File indexroot = new File(root, "DATA/INDEX"); try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {} - final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"), "text.urlmd", false, false); + final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT")); + currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false); currentUrlDB.deadlinkCleaner(); currentUrlDB.close(); } @@ -849,7 +858,14 @@ public final class yacy { log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), 10000, - Integer.MAX_VALUE, false, false, false); + Integer.MAX_VALUE, + false, // useTailCache + false, // exceed134217727 + false, // connectLocalSolr + false, // useCitationIndex + true, // useRWI + true // useMetadata + ); indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false); } int counter = 0;