- more abstraction for the RWI index as preparation for solr integration

- added options in search index to switch parts of the index on or off
pull/1/head
orbiter 13 years ago
parent 6cc5d1094e
commit 69e743d9e3

@ -46,7 +46,7 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.NewsPool;
import net.yacy.search.Switchboard;
@ -194,7 +194,7 @@ public class Bookmarks {
final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
final URIMetadataRow urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash));
final URIMetadata urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash));
if (urlentry != null) try {
final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null));
prop.put("mode_edit", "0"); // create mode

@ -35,7 +35,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
@ -178,7 +178,7 @@ public class CrawlResults {
boolean dark = true;
String urlstr, urltxt;
Seed initiatorSeed, executorSeed;
URIMetadataRow urle;
URIMetadata urle;
int cnt = 0;
final Iterator<Map.Entry<String, InitExecEntry>> i = ResultURLs.results(tabletype);

@ -33,10 +33,10 @@
<dt class="TableCellDark">Index Deletion</dt>
<dd><input type="checkbox" name="deleteIndex" id="deleteIndex"
onclick="x=document.getElementById('deleteIndex').checked;document.getElementById('deleteTriplestore').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deleteTriplestore').disabled=c;document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
/><label for="deleteIndex">Delete Search Index</label><br/>
#(solr)#::<input type="checkbox" name="deleteSolr" id="deleteSolr"
onclick="x=document.getElementById('deleteSolr').checked;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
/><label for="deleteSolr">Delete Solr Index</label><br/>#(/solr)#
/><label for="deleteIndex">Delete local search index (including local solr)</label><br/>
#(solr)#::<input type="checkbox" name="deleteRemoteSolr" id="deleteRemoteSolr"
onclick="x=document.getElementById('deleteRemoteSolr').checked;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';}document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
/><label for="deleteRemoteSolr">Delete remote solr index</label><br/>#(/solr)#
<input type="checkbox" name="deleteTriplestore" id="deleteTriplestore" disabled="disabled" /><label for="deleteTriplestore">Delete RDF Triplestore</label><br/>
<input type="checkbox" name="deleteCache" id="deleteCache" disabled="disabled" /><label for="deleteCache">Delete HTTP &amp; FTP Cache</label><br/>
<input type="checkbox" name="deleteCrawlQueues" id="deleteCrawlQueues" disabled="disabled" /><label for="deleteCrawlQueues">Stop Crawler and delete Crawl Queues</label><br/>

@ -42,6 +42,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
@ -156,7 +157,7 @@ public class IndexControlRWIs_p {
if ( post.get("deleteIndex", "").equals("on") ) {
segment.clear();
}
if ( post.get("deleteSolr", "").equals("on") && sb.index.getRemoteSolr() != null) {
if ( post.get("deleteRemoteSolr", "").equals("on") && sb.index.getRemoteSolr() != null) {
try {
sb.index.getRemoteSolr().clear();
} catch ( final Exception e ) {
@ -307,15 +308,15 @@ public class IndexControlRWIs_p {
index = segment.termIndex().get(keyhash, null);
// build urlCache
final Iterator<WordReference> urlIter = index.entries();
final TreeMap<byte[], URIMetadataRow> knownURLs =
new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
final TreeMap<byte[], URIMetadata> knownURLs =
new TreeMap<byte[], URIMetadata>(Base64Order.enhancedCoder);
final HandleSet unknownURLEntries =
new HandleSet(
WordReferenceRow.urlEntryRow.primaryKeyLength,
WordReferenceRow.urlEntryRow.objectOrder,
index.size());
Reference iEntry;
URIMetadataRow lurl;
URIMetadata lurl;
while (urlIter.hasNext()) {
iEntry = urlIter.next();
lurl = segment.urlMetadata().load(iEntry.urlhash());
@ -413,7 +414,7 @@ public class IndexControlRWIs_p {
} catch ( final RowSpaceExceededException e ) {
Log.logException(e);
}
final URIMetadataRow e = segment.urlMetadata().load(b);
final URIMetadata e = segment.urlMetadata().load(b);
segment.urlMetadata().remove(b);
if ( e != null ) {
url = e.url();
@ -448,7 +449,7 @@ public class IndexControlRWIs_p {
} catch ( final RowSpaceExceededException e ) {
Log.logException(e);
}
final URIMetadataRow e = segment.urlMetadata().load(b);
final URIMetadata e = segment.urlMetadata().load(b);
segment.urlMetadata().remove(b);
if ( e != null ) {
url = e.url();
@ -514,7 +515,7 @@ public class IndexControlRWIs_p {
prop.put("genUrlList_lines", maxlines);
int i = 0;
DigestURI url;
URIMetadataRow entry;
URIMetadata entry;
String us;
long rn = -1;
while ( !ranked.isEmpty() && (entry = ranked.takeURL(false, 1000)) != null ) {

@ -35,7 +35,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
@ -132,7 +132,7 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashdelete")) {
final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
@ -166,7 +166,7 @@ public class IndexControlURLs_p {
final DigestURI url = new DigestURI(urlstring);
urlhash = ASCII.String(url.hash());
prop.put("urlhash", urlhash);
final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
if (entry == null) {
prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true));
prop.putHTML("urlstring", urlstring);
@ -184,7 +184,7 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashsearch")) {
final URIMetadataRow entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
final URIMetadata entry = segment.urlMetadata().load(ASCII.getBytes(urlhash));
if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash);
} else {
@ -199,9 +199,9 @@ public class IndexControlURLs_p {
// generate list
if (post.containsKey("urlhashsimilar")) {
try {
final Iterator<URIMetadataRow> entryIt = new RotateIterator<URIMetadataRow>(segment.urlMetadata().entries(true, urlhash), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax());
final Iterator<URIMetadata> entryIt = new RotateIterator<URIMetadata>(segment.urlMetadata().entries(true, urlhash), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
URIMetadataRow entry;
URIMetadata entry;
int i = 0, rows = 0, cols = 0;
prop.put("urlhashsimilar", "1");
while (entryIt.hasNext() && i < 256) {
@ -303,14 +303,14 @@ public class IndexControlURLs_p {
return prop;
}
private static serverObjects genUrlProfile(final Segment segment, final URIMetadataRow entry, final String urlhash) {
private static serverObjects genUrlProfile(final Segment segment, final URIMetadata entry, final String urlhash) {
final serverObjects prop = new serverObjects();
if (entry == null) {
prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash);
return prop;
}
final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
if (entry.url() == null) {
prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash);

@ -51,7 +51,7 @@ import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
@ -112,7 +112,7 @@ public class ViewFile {
// get the url hash from which the content should be loaded
String urlHash = post.get("urlHash", "");
URIMetadataRow urlEntry = null;
URIMetadata urlEntry = null;
// get the urlEntry that belongs to the url hash
if (urlHash.length() > 0 && (urlEntry = indexSegment.urlMetadata().load(ASCII.getBytes(urlHash))) != null) {
// get the url that belongs to the entry

@ -35,7 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
@ -86,12 +86,12 @@ public class Vocabulary_p {
if (p >= 0) t = t.substring(p + 1);
}
if (discoverFromTitle || discoverFromTitleSplitted) {
URIMetadataRow m = segment.urlMetadata().load(u.hash());
URIMetadata m = segment.urlMetadata().load(u.hash());
if (m != null) t = m.dc_title();
if (t.endsWith(".jpg") || t.endsWith(".gif")) continue;
}
if (discoverFromAuthor) {
URIMetadataRow m = segment.urlMetadata().load(u.hash());
URIMetadata m = segment.urlMetadata().load(u.hash());
if (m != null) t = m.dc_creator();
}
t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim();

@ -35,7 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.RequestHeader.FileType;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
@ -97,13 +97,13 @@ public class yacydoc {
}
if (urlhash == null || urlhash.isEmpty()) return prop;
final URIMetadataRow entry = segment.urlMetadata().load(urlhash.getBytes());
final URIMetadata entry = segment.urlMetadata().load(urlhash.getBytes());
if (entry == null) return prop;
if (entry.url() == null) {
return prop;
}
final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
prop.putXML("dc_title", entry.dc_title());
prop.putXML("dc_creator", entry.dc_creator());

@ -31,7 +31,7 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.peers.Protocol;
import net.yacy.search.Switchboard;
import de.anomic.crawler.NoticedURL;
@ -110,7 +110,7 @@ public class urls {
if (urlhashes.length() % 12 != 0) return prop;
final int count = urlhashes.length() / 12;
int c = 0;
URIMetadataRow entry;
URIMetadata entry;
DigestURI referrer;
for (int i = 0; i < count; i++) {
entry = sb.index.urlMetadata().load(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));

@ -57,7 +57,7 @@ import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser;
import net.yacy.document.geolocation.GeoLocation;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.logging.Log;
@ -660,7 +660,7 @@ public class yacysearch {
return prop;
}
final String recommendHash = post.get("recommendref", ""); // urlhash
final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash));
final URIMetadata urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash));
if ( urlentry != null ) {
Document[] documents = null;
try {
@ -696,7 +696,7 @@ public class yacysearch {
return prop;
}
final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(bookmarkHash));
final URIMetadata urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(bookmarkHash));
if ( urlentry != null ) {
try {
sb.tables.bookmarks.createBookmark(

@ -45,7 +45,7 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.workflow.WorkflowProcessor;
@ -439,7 +439,7 @@ public final class CrawlStacker {
// check if the url is double registered
final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
final URIMetadataRow oldEntry = this.indexSegment.urlMetadata().load(url.hash());
final URIMetadata oldEntry = this.indexSegment.urlMetadata().load(url.hash());
if (oldEntry == null) {
if (dbocc != null) {
// do double-check

@ -32,7 +32,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.document.parser.sitemapParser;
import net.yacy.document.parser.sitemapParser.URLEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import de.anomic.crawler.retrieval.Request;
@ -84,7 +84,7 @@ public class SitemapImporter extends Thread {
final String dbocc = this.sb.urlExists(nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
// the url was already loaded. we need to check the date
final URIMetadataRow oldEntry = this.sb.index.urlMetadata().load(nexturlhash);
final URIMetadata oldEntry = this.sb.index.urlMetadata().load(nexturlhash);
if (oldEntry != null) {
final Date modDate = oldEntry.moddate();
// check if modDate is null

@ -419,7 +419,8 @@ public class URLAnalysis {
public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException {
System.out.println("INDEX DIFF URL-COL startup");
final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile));
final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
mr.connectUrlDb(Segment.UrlDbName, false, false);
final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000);
System.out.println("INDEX DIFF URL-COL loaded dump, starting diff");
final long start = System.currentTimeMillis();
@ -447,7 +448,8 @@ public class URLAnalysis {
public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException {
// format: 0=text, 1=html, 2=rss/xml
System.out.println("URL EXPORT startup");
final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
mr.connectUrlDb(Segment.UrlDbName, false, false);
final HandleSet hs = (diffFile == null) ? null : new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
System.out.println("URL EXPORT loaded dump, starting export");
final Export e = mr.export(new File(export), ".*", hs, format, false);
@ -461,7 +463,8 @@ public class URLAnalysis {
public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException {
System.out.println("URL DELETE startup");
final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
mr.connectUrlDb(Segment.UrlDbName, false, false);
final int mrSize = mr.size();
final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);

@ -36,7 +36,7 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Segment;
import de.anomic.crawler.retrieval.Response;
@ -105,7 +105,7 @@ public class YMarkMetadata {
public EnumMap<METADATA, String> getMetadata() {
final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
final URIMetadataRow urlEntry = this.indexSegment.urlMetadata().load(this.uri.hash());
final URIMetadata urlEntry = this.indexSegment.urlMetadata().load(this.uri.hash());
if (urlEntry != null) {
metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size()));
metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate()));

@ -34,6 +34,7 @@ import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
@ -89,7 +90,7 @@ public class AbstractSolrConnector implements SolrConnector {
@Override
public long getSize() {
try {
final SolrDocumentList list = get("*:*", 0, 1);
final SolrDocumentList list = query("*:*", 0, 1);
return list.getNumFound();
} catch (final Throwable e) {
Log.logException(e);
@ -132,8 +133,8 @@ public class AbstractSolrConnector implements SolrConnector {
@Override
public boolean exists(final String id) throws IOException {
try {
final SolrDocumentList list = get(SolrField.id.getSolrFieldName() + ":" + id, 0, 1);
return list.getNumFound() > 0;
final SolrDocument doc = get(id);
return doc != null;
} catch (final Throwable e) {
Log.logException(e);
return false;
@ -186,7 +187,7 @@ public class AbstractSolrConnector implements SolrConnector {
* @throws IOException
*/
@Override
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException {
public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException {
// construct query
final SolrQuery query = new SolrQuery();
query.setQuery(querystring);
@ -209,8 +210,33 @@ public class AbstractSolrConnector implements SolrConnector {
} catch (final Throwable e) {
throw new IOException(e);
}
}
/**
 * Get a single document from Solr by its document id.
 * @param id the Solr document id
 * @return the matching document, or null if no document with this id exists
 * @throws IOException if the Solr request fails
 */
@Override
public SolrDocument get(final String id) throws IOException {
    // build the query "<idfield>:<id>" selecting exactly the one document with this id;
    // StringBuilder instead of StringBuffer: a method-local needs no synchronization
    final StringBuilder sb = new StringBuilder(id.length() + 3);
    sb.append(SolrField.id.getSolrFieldName()).append(':').append(id);
    // NOTE(review): the id is concatenated unescaped; if ids may contain Solr query
    // syntax characters they must be escaped — confirm the id alphabet is safe.
    final SolrQuery query = new SolrQuery();
    query.setQuery(sb.toString());
    query.setRows(1);
    query.setStart(0);
    // query the server; surface any failure as IOException
    try {
        final QueryResponse rsp = this.server.query(query);
        final SolrDocumentList docs = rsp.getResults();
        if (docs.isEmpty()) return null;
        return docs.get(0);
    } catch (final Throwable e) {
        throw new IOException(e);
    }
}
}

@ -5,6 +5,7 @@ import java.util.Collection;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
@ -111,6 +112,11 @@ public class MultipleSolrConnector implements SolrConnector {
return this.solr.exists(id);
}
@Override
public SolrDocument get(String id) throws IOException {
    // single-document lookup: delegate directly to the wrapped connector
    return this.solr.get(id);
}
@Override
public void add(final SolrDoc solrdoc) throws IOException, SolrException {
try {
@ -132,8 +138,8 @@ public class MultipleSolrConnector implements SolrConnector {
}
@Override
public SolrDocumentList get(String querystring, int offset, int count) throws IOException {
return this.solr.get(querystring, offset, count);
public SolrDocumentList query(String querystring, int offset, int count) throws IOException {
return this.solr.query(querystring, offset, count);
}
@Override

@ -28,6 +28,7 @@ import java.io.IOException;
import java.util.Collection;
import java.util.List;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
@ -120,6 +121,21 @@ public class RetrySolrConnector implements SolrConnector {
return false;
}
@Override
public SolrDocument get(String id) throws IOException {
    // Retry the by-id lookup until it succeeds or retryMaxTime has elapsed,
    // sleeping briefly between attempts.
    final long deadline = System.currentTimeMillis() + this.retryMaxTime;
    Throwable lastError = null;
    while (System.currentTimeMillis() < deadline) {
        try {
            return this.solrConnector.get(id);
        } catch (final Throwable e) {
            lastError = e;
            try {
                Thread.sleep(10);
            } catch (final InterruptedException e1) {
                // restore the interrupt flag and stop retrying instead of swallowing it
                Thread.currentThread().interrupt();
                break;
            }
        }
    }
    if (lastError != null) {
        // preserve the original failure as the cause instead of only its message
        throw (lastError instanceof IOException) ? (IOException) lastError : new IOException(lastError.getMessage(), lastError);
    }
    return null;
}
@Override
public void add(final SolrDoc solrdoc) throws IOException, SolrException {
final long t = System.currentTimeMillis() + this.retryMaxTime;
@ -141,11 +157,11 @@ public class RetrySolrConnector implements SolrConnector {
}
@Override
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException {
public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException {
final long t = System.currentTimeMillis() + this.retryMaxTime;
Throwable ee = null;
while (System.currentTimeMillis() < t) try {
return this.solrConnector.get(querystring, offset, count);
return this.solrConnector.query(querystring, offset, count);
} catch (final Throwable e) {
ee = e;
try {Thread.sleep(10);} catch (final InterruptedException e1) {}

@ -116,6 +116,15 @@ public class ShardSolrConnector implements SolrConnector {
}
return false;
}
@Override
public SolrDocument get(String id) throws IOException {
    // ask each shard connector in turn; the first one holding the document wins
    for (final SolrConnector connector: this.connectors) {
        SolrDocument doc = connector.get(id);
        if (doc != null) return doc;
    }
    // no shard has a document with this id
    return null;
}
/**
* add a Solr document
@ -148,10 +157,10 @@ public class ShardSolrConnector implements SolrConnector {
* @throws IOException
*/
@Override
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException {
public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException {
final SolrDocumentList list = new SolrDocumentList();
for (final SolrConnector connector: this.connectors) {
final SolrDocumentList l = connector.get(querystring, offset, count);
final SolrDocumentList l = connector.query(querystring, offset, count);
for (final SolrDocument d: l) {
list.add(d);
}
@ -163,7 +172,7 @@ public class ShardSolrConnector implements SolrConnector {
final SolrDocumentList[] list = new SolrDocumentList[this.connectors.size()];
int i = 0;
for (final SolrConnector connector: this.connectors) {
list[i++] = connector.get(querystring, offset, count);
list[i++] = connector.query(querystring, offset, count);
}
return list;
}

@ -28,6 +28,7 @@ import java.io.IOException;
import java.util.Collection;
import java.util.List;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
@ -87,13 +88,21 @@ public interface SolrConnector {
public void add(final SolrDoc solrdoc) throws IOException, SolrException;
public void add(final Collection<SolrDoc> solrdocs) throws IOException, SolrException;
/**
* get a document from solr by given id
* @param id
* @return one result or null if no result exists
* @throws IOException
*/
public SolrDocument get(final String id) throws IOException;
/**
* get a query result from solr
* to get all results set the query String to "*:*"
* @param querystring
* @throws IOException
*/
public SolrDocumentList get(final String querystring, final int offset, final int count) throws IOException;
public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException;
/**
* get the size of the index

@ -24,8 +24,8 @@ package net.yacy.kelondro.data.meta;
import java.util.Date;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.rwi.Reference;
public interface URIMetadata extends URIReference {
@ -74,10 +74,12 @@ public interface URIMetadata extends URIReference {
public String snippet();
public Reference word();
public WordReference word();
public boolean isOlder(final URIMetadata other);
public String toString(final String snippet);
public byte[] referrerHash();
}

@ -35,6 +35,12 @@ public interface URIReference {
*/
public byte[] hash();
/**
* the second half of a uri hash is the host hash
* @return the 6-character host hash (the second half of the url hash)
*/
public String hosthash();
/**
* The modification date of the URIReference is given if
* the record was created first and is defined with the

@ -49,6 +49,14 @@ public class URIReferenceNode extends HashMap<String, byte[]> implements URIRefe
return this.hash;
}
private String hostHash = null;
@Override
public String hosthash() {
    // lazily computed cache; not synchronized — a concurrent first call may compute
    // the value twice, but both results are identical, so the race is benign
    if (this.hostHash != null) return this.hostHash;
    // the host hash is the second half (offset 6, length 6) of the url hash
    this.hostHash = ASCII.String(this.hash, 6, 6);
    return this.hostHash;
}
@Override
public Date moddate() {
byte[] x = this.get(MetadataVocabulary.moddate.name());

@ -29,7 +29,6 @@ import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.RSSMessage;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MapTools;
import net.yacy.peers.operation.yacyVersion;

@ -77,6 +77,7 @@ import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
@ -1155,7 +1156,7 @@ public final class Protocol
public static String transferIndex(
final Seed targetSeed,
final ReferenceContainerCache<WordReference> indexes,
final SortedMap<byte[], URIMetadataRow> urlCache,
final SortedMap<byte[], URIMetadata> urlCache,
final boolean gzipBody,
final int timeout) {
@ -1216,7 +1217,7 @@ public final class Protocol
} // all url's known
// extract the urlCache from the result
final URIMetadataRow[] urls = new URIMetadataRow[uhs.length];
final URIMetadata[] urls = new URIMetadataRow[uhs.length];
for ( int i = 0; i < uhs.length; i++ ) {
urls[i] = urlCache.get(ASCII.getBytes(uhs[i]));
if ( urls[i] == null ) {
@ -1324,7 +1325,7 @@ public final class Protocol
private static Map<String, String> transferURL(
final Seed targetSeed,
final URIMetadataRow[] urls,
final URIMetadata[] urls,
boolean gzipBody,
final int timeout) {
// this post a message to the remote message board
@ -1346,7 +1347,7 @@ public final class Protocol
String resource;
int urlc = 0;
int urlPayloadSize = 0;
for ( final URIMetadataRow url : urls ) {
for ( final URIMetadata url : urls ) {
if ( url != null ) {
resource = url.toString();
//System.out.println("*** DEBUG resource = " + resource);

@ -32,7 +32,7 @@ import java.util.SortedMap;
import java.util.TreeMap;
import net.yacy.cora.document.ASCII;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
@ -90,7 +90,7 @@ public class Transmission {
*/
private final byte[] primaryTarget;
private final ReferenceContainerCache<WordReference> containers;
private final SortedMap<byte[], URIMetadataRow> references;
private final SortedMap<byte[], URIMetadata> references;
private final HandleSet badReferences;
private final List<Seed> targets;
private int hit, miss;
@ -106,7 +106,7 @@ public class Transmission {
super();
this.primaryTarget = primaryTarget;
this.containers = new ReferenceContainerCache<WordReference>(Segment.wordReferenceFactory, Segment.wordOrder, Word.commonHashLength);
this.references = new TreeMap<byte[], URIMetadataRow>(Base64Order.enhancedCoder);
this.references = new TreeMap<byte[], URIMetadata>(Base64Order.enhancedCoder);
this.badReferences = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0);
this.targets = targets;
this.hit = 0;
@ -175,7 +175,7 @@ public class Transmission {
notFoundx.add(e.urlhash());
continue;
}
final URIMetadataRow r = Transmission.this.segment.urlMetadata().load(e.urlhash());
final URIMetadata r = Transmission.this.segment.urlMetadata().load(e.urlhash());
if (r == null) {
notFoundx.add(e.urlhash());
this.badReferences.put(e.urlhash());

@ -45,6 +45,7 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
@ -332,7 +333,7 @@ public class Blacklist {
* @param entry Entry to be checked
* @return Whether the given entry is blacklisted
*/
public boolean isListed(final BlacklistType blacklistType, final URIMetadataRow entry) {
public boolean isListed(final BlacklistType blacklistType, final URIMetadata entry) {
// Call inner method
return isListed(blacklistType, entry.url());
}

@ -111,6 +111,7 @@ import net.yacy.document.parser.html.Evaluation;
import net.yacy.gui.Tray;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet;
@ -391,8 +392,12 @@ public final class Switchboard extends serverSwitch
fileSizeMax,
this.useTailCache,
this.exceed134217727,
solrLocal);
solrLocal,
true, // useCitationIndex
true, // useRWI
true // useMetadata
);
// prepare a solr index profile switch list
final File solrBackupProfile = new File("defaults/solr.keys.list");
final String schemename =
@ -1197,7 +1202,11 @@ public final class Switchboard extends serverSwitch
fileSizeMax,
this.useTailCache,
this.exceed134217727,
solrLocal);
solrLocal,
true, // useCitationIndex
true, // useRWI
true // useMetadata
);
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object
// create a crawler
@ -1447,7 +1456,7 @@ public final class Switchboard extends serverSwitch
if ( urlhash.length == 0 ) {
return null;
}
final URIMetadataRow le = this.index.urlMetadata().load(urlhash);
final URIMetadata le = this.index.urlMetadata().load(urlhash);
if ( le != null ) {
return le.url();
}

@ -41,6 +41,7 @@ import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.query.QueryParams;
@ -74,7 +75,19 @@ public class DocumentIndex extends Segment
public DocumentIndex(final File segmentPath, final CallbackListener callback, final int cachesize)
throws IOException {
super(new Log("DocumentIndex"), segmentPath, cachesize, targetFileSize * 4 - 1, false, false, true);
super(
new Log("DocumentIndex"),
segmentPath,
cachesize,
targetFileSize * 4 - 1,
false, // useTailCache
false, // exceed134217727
true, // connectLocalSolr
true, // useCitationIndex
true, // useRWI
true // useMetadata
);
final int cores = Runtime.getRuntime().availableProcessors() + 1;
this.callback = callback;
this.queue = new LinkedBlockingQueue<DigestURI>(cores * 300);
@ -227,7 +240,7 @@ public class DocumentIndex extends Segment
rankedCache.start();
// search is running; retrieve results
URIMetadataRow row;
URIMetadata row;
final ArrayList<DigestURI> files = new ArrayList<DigestURI>();
while ( (row = rankedCache.takeURL(false, 1000)) != null ) {
files.add(row.url());

@ -49,6 +49,7 @@ import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.Cache;
@ -65,43 +66,38 @@ import net.yacy.search.Switchboard;
import net.yacy.search.solr.EmbeddedSolrConnector;
import org.apache.lucene.util.Version;
import de.anomic.crawler.CrawlStacker;
public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]> {
// class objects
protected Index urlIndexFile;
private final File location;
private Index urlIndexFile;
private Export exportthread; // will have a export thread assigned if exporter is running
private final File location;
private final String tablename;
private String tablename;
private ArrayList<HostStat> statsDump;
private SolrConnector localSolr, remoteSolr;
public MetadataRepository(
final File path,
final String tablename,
final boolean useTailCache,
final boolean exceed134217727) {
public MetadataRepository(final File path) {
this.location = path;
this.tablename = tablename;
Index backupIndex = null;
backupIndex = new SplitTable(this.location, tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727);
this.urlIndexFile = backupIndex; //new Cache(backupIndex, 20000000, 20000000);
this.tablename = null;
this.urlIndexFile = null;
this.exportthread = null; // will have a export thread assigned if exporter is running
this.statsDump = null;
this.remoteSolr = null;
this.localSolr = null;
}
public void connectRemoteSolr(final SolrConnector solr) {
this.remoteSolr = solr;
public void connectUrlDb(final String tablename, final boolean useTailCache, final boolean exceed134217727) {
if (this.urlIndexFile != null) return;
this.tablename = tablename;
this.urlIndexFile = new SplitTable(this.location, tablename, URIMetadataRow.rowdef, useTailCache, exceed134217727);
}
public void disconnectRemoteSolr() {
if (this.remoteSolr == null) return;
this.remoteSolr.close();
this.remoteSolr = null;
public void disconnectUrlDb() {
if (this.urlIndexFile == null) return;
this.urlIndexFile.close();
this.urlIndexFile = null;
}
public void connectLocalSolr() throws IOException {
@ -123,6 +119,16 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.localSolr.close();
this.localSolr = null;
}
public void connectRemoteSolr(final SolrConnector solr) {
this.remoteSolr = solr;
}
public void disconnectRemoteSolr() {
if (this.remoteSolr == null) return;
this.remoteSolr.close();
this.remoteSolr = null;
}
public SolrConnector getLocalSolr() {
return this.localSolr;
@ -133,7 +139,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
}
public void clearCache() {
if (this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
if (this.statsDump != null) this.statsDump.clear();
this.statsDump = null;
}
@ -142,15 +148,22 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
if (this.exportthread != null) this.exportthread.interrupt();
if (this.urlIndexFile == null) {
SplitTable.delete(this.location, this.tablename);
this.urlIndexFile = new SplitTable(this.location, this.tablename, URIMetadataRow.rowdef, false, false);
} else {
this.urlIndexFile.clear();
}
if (this.localSolr != null) {
this.localSolr.clear();
}
// the remote solr is not cleared here because that shall be done separately
this.statsDump = null;
}
public int size() {
return this.urlIndexFile == null ? 0 : this.urlIndexFile.size();
int size = 0;
size += this.urlIndexFile == null ? 0 : this.urlIndexFile.size();
size += this.localSolr == null ? 0 : this.localSolr.getSize();
size += this.remoteSolr == null ? 0 : this.remoteSolr.getSize();
return size;
}
public void close() {
@ -170,8 +183,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
}
public int writeCacheSize() {
if (this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize();
if (this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize();
if (this.urlIndexFile != null && this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize();
if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize();
return 0;
}
@ -181,59 +194,69 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
* @param obrwi
* @return
*/
public URIMetadataRow load(final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi) {
if (this.urlIndexFile == null) return null;
public URIMetadata load(final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi) {
if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
final byte[] urlHash = obrwi.getElement().urlhash();
if (urlHash == null) return null;
try {
if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry == null) return null;
return new URIMetadataRow(entry, obrwi.getElement(), obrwi.getWeight());
} catch (final IOException e) {
return null;
Log.logException(e);
}
/*
if (this.localSolr != null) {
try {
SolrDocument doc = this.localSolr.get(ASCII.String(urlHash));
} catch (IOException e) {
Log.logException(e);
}
}
*/
return null;
}
public URIMetadataRow load(final byte[] urlHash) {
if (this.urlIndexFile == null) return null;
public URIMetadata load(final byte[] urlHash) {
if (urlHash == null) return null;
try {
if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry == null) return null;
return new URIMetadataRow(entry, null, 0);
} catch (final IOException e) {
return null;
}
return null;
}
public void store(final URIMetadataRow entry) throws IOException {
public void store(final URIMetadata entry) throws IOException {
// Check if there is a more recent Entry already in the DB
URIMetadataRow oldEntry;
if (this.urlIndexFile == null) return; // case may happen during shutdown or startup
try {
final Row.Entry oe = this.urlIndexFile.get(entry.hash(), false);
oldEntry = (oe == null) ? null : new URIMetadataRow(oe, null, 0);
} catch (final Exception e) {
Log.logException(e);
oldEntry = null;
if (this.urlIndexFile != null && entry instanceof URIMetadataRow) {
URIMetadata oldEntry = null;
try {
final Row.Entry oe = this.urlIndexFile.get(entry.hash(), false);
oldEntry = (oe == null) ? null : new URIMetadataRow(oe, null, 0);
} catch (final Exception e) {
Log.logException(e);
oldEntry = null;
}
if (oldEntry != null && entry.isOlder(oldEntry)) {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
// doesn't make sense, since no return value:
//entry = oldEntry;
return; // this did not need to be stored, but is updated
}
try {
this.urlIndexFile.put(((URIMetadataRow) entry).toRowEntry());
} catch (final RowSpaceExceededException e) {
throw new IOException("RowSpaceExceededException in " + this.urlIndexFile.filename() + ": " + e.getMessage());
}
this.statsDump = null;
if (MemoryControl.shortStatus()) clearCache();
}
if (oldEntry != null && entry.isOlder(oldEntry)) {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
// doesn't make sense, since no return value:
//entry = oldEntry;
return; // this did not need to be stored, but is updated
}
try {
this.urlIndexFile.put(entry.toRowEntry());
} catch (final RowSpaceExceededException e) {
throw new IOException("RowSpaceExceededException in " + this.urlIndexFile.filename() + ": " + e.getMessage());
}
this.statsDump = null;
if (MemoryControl.shortStatus()) clearCache() ;
}
public boolean remove(final byte[] urlHash) {
@ -251,13 +274,14 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
Log.logException(e);
}
}
try {
if (this.urlIndexFile != null) try {
final Row.Entry r = this.urlIndexFile.remove(urlHash);
if (r != null) this.statsDump = null;
return r != null;
} catch (final IOException e) {
return false;
}
return false;
}
public boolean exists(final byte[] urlHash) {
@ -297,17 +321,17 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
return keys(true, null);
}
public CloneableIterator<URIMetadataRow> entries() throws IOException {
public CloneableIterator<URIMetadata> entries() throws IOException {
// enumerates entry elements
return new kiter();
}
public CloneableIterator<URIMetadataRow> entries(final boolean up, final String firstHash) throws IOException {
public CloneableIterator<URIMetadata> entries(final boolean up, final String firstHash) throws IOException {
// enumerates entry elements
return new kiter(up, firstHash);
}
public class kiter implements CloneableIterator<URIMetadataRow> {
public class kiter implements CloneableIterator<URIMetadata> {
// enumerates entry elements
private final CloneableIterator<Row.Entry> iter;
private final boolean error;
@ -342,7 +366,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
}
@Override
public final URIMetadataRow next() {
public final URIMetadata next() {
Row.Entry e = null;
if (this.iter == null) { return null; }
if (this.iter.hasNext()) { e = this.iter.next(); }
@ -372,7 +396,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
final Log log = new Log("URLDBCLEANUP");
final HashSet<String> damagedURLS = new HashSet<String>();
try {
final Iterator<URIMetadataRow> eiter = entries(true, null);
final Iterator<URIMetadata> eiter = entries(true, null);
int iteratorCount = 0;
while (eiter.hasNext()) try {
eiter.next();
@ -456,7 +480,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
public void run() {
try {
Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet");
final Iterator<URIMetadataRow> eiter = entries(true, null);
final Iterator<URIMetadata> eiter = entries(true, null);
while (eiter.hasNext() && this.run) {
synchronized (this) {
if (this.pause) {
@ -469,7 +493,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
}
}
}
final URIMetadataRow entry = eiter.next();
final URIMetadata entry = eiter.next();
if (entry == null) {
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null");
} else if (entry.hash() == null) {
@ -605,8 +629,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.count++;
}
} else {
final Iterator<URIMetadataRow> i = entries(); // iterates indexURLEntry objects
URIMetadataRow entry;
final Iterator<URIMetadata> i = entries(); // iterates indexURLEntry objects
URIMetadata entry;
String url;
while (i.hasNext()) {
entry = i.next();
@ -704,7 +728,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
// collect hashes from all domains
// fetch urls from the database to determine the host in clear text
URIMetadataRow urlref;
URIMetadata urlref;
if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
this.statsDump = new ArrayList<HostStat>();
final TreeSet<String> set = new TreeSet<String>();
@ -741,7 +765,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
*/
public Map<String, HostStat> domainHashResolver(final Map<String, URLHashCounter> domainSamples) {
final HashMap<String, HostStat> hostMap = new HashMap<String, HostStat>();
URIMetadataRow urlref;
URIMetadata urlref;
final ScoreMap<String> hosthashScore = new ConcurrentScoreMap<String>();
for (final Map.Entry<String, URLHashCounter> e: domainSamples.entrySet()) {
@ -762,7 +786,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
// fetch urls from the database to determine the host in clear text
final Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first)
URIMetadataRow urlref;
URIMetadata urlref;
String urlhash;
count += 10; // make some more to prevent that we have to do this again after deletions too soon.
if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size();

@ -47,6 +47,7 @@ import net.yacy.document.Parser;
import net.yacy.kelondro.data.citation.CitationReference;
import net.yacy.kelondro.data.citation.CitationReferenceFactory;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
@ -88,7 +89,8 @@ public class Segment {
public static final int lowcachedivisor = 900;
public static final long targetFileSize = 64 * 1024 * 1024; // 256 MB
public static final int writeBufferSize = 4 * 1024 * 1024;
public static final String UrlDbName = "text.urlmd";
// the reference factory
public static final ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory();
public static final ReferenceFactory<CitationReference> citationReferenceFactory = new CitationReferenceFactory();
@ -109,14 +111,17 @@ public class Segment {
final long maxFileSize,
final boolean useTailCache,
final boolean exceed134217727,
final boolean connectLocalSolr) throws IOException {
final boolean connectLocalSolr,
final boolean useCitationIndex,
final boolean useRWI,
final boolean useMetadata) throws IOException {
log.logInfo("Initializing Segment '" + segmentPath + ".");
this.log = log;
this.segmentPath = segmentPath;
this.termIndex = new IndexCell<WordReference>(
this.termIndex = useRWI ? new IndexCell<WordReference>(
segmentPath,
"text.index",
wordReferenceFactory,
@ -125,9 +130,9 @@ public class Segment {
entityCacheMaxSize,
targetFileSize,
maxFileSize,
writeBufferSize);
writeBufferSize) : null;
this.urlCitationIndex = new IndexCell<CitationReference>(
this.urlCitationIndex = useCitationIndex ? new IndexCell<CitationReference>(
segmentPath,
"citation.index",
citationReferenceFactory,
@ -136,10 +141,11 @@ public class Segment {
entityCacheMaxSize,
targetFileSize,
maxFileSize,
writeBufferSize);
writeBufferSize) : null;
// create LURL-db
this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727);
this.urlMetadata = new MetadataRepository(segmentPath);
if (useMetadata) this.urlMetadata.connectUrlDb(UrlDbName, useTailCache, exceed134217727);
if (connectLocalSolr) this.connectLocalSolr();
}
@ -148,10 +154,12 @@ public class Segment {
}
public long RWICount() {
if (this.termIndex == null) return 0;
return this.termIndex.sizesMax();
}
public int RWIBufferCount() {
if (this.termIndex == null) return 0;
return this.termIndex.getBufferSize();
}
@ -235,7 +243,7 @@ public class Segment {
}
@Override
public DigestURI next() {
URIMetadataRow umr = Segment.this.urlMetadata.load(bi.next());
URIMetadata umr = Segment.this.urlMetadata.load(bi.next());
return umr.url();
}
@Override
@ -260,9 +268,9 @@ public class Segment {
public void clear() {
try {
this.termIndex.clear();
this.urlMetadata.clear();
this.urlCitationIndex.clear();
if (this.termIndex != null) this.termIndex.clear();
if (this.urlMetadata != null) this.urlMetadata.clear();
if (this.urlCitationIndex != null) this.urlCitationIndex.clear();
} catch (final IOException e) {
Log.logException(e);
}
@ -328,7 +336,7 @@ public class Segment {
assert (wprop.flags != null);
ientry.setWord(wprop);
wordhash = Word.word2hash(word);
try {
if (this.termIndex != null) try {
this.termIndex.add(wordhash, ientry);
} catch (final Exception e) {
Log.logException(e);
@ -354,7 +362,7 @@ public class Segment {
// assign the catchall word
ientry.setWord(wprop == null ? catchallWord : wprop); // we use one of the word properties as template to get the document characteristics
try {
if (this.termIndex != null) try {
this.termIndex.add(catchallHash, ientry);
} catch (final Exception e) {
Log.logException(e);
@ -385,9 +393,9 @@ public class Segment {
}
public synchronized void close() {
this.termIndex.close();
this.urlMetadata.close();
this.urlCitationIndex.close();
if (this.termIndex != null) this.termIndex.close();
if (this.urlMetadata != null) this.urlMetadata.close();
if (this.urlCitationIndex != null) this.urlCitationIndex.close();
}
public URIMetadataRow storeDocument(
@ -541,7 +549,7 @@ public class Segment {
if (urlhash == null) return 0;
// determine the url string
final URIMetadataRow entry = urlMetadata().load(urlhash);
final URIMetadata entry = urlMetadata().load(urlhash);
if (entry == null) return 0;
if (entry.url() == null) return 0;
@ -612,7 +620,7 @@ public class Segment {
entry = new WordReferenceVars(containerIterator.next());
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
final URIMetadataRow ue = Segment.this.urlMetadata.load(entry.urlhash());
final URIMetadata ue = Segment.this.urlMetadata.load(entry.urlhash());
if (ue == null) {
urlHashs.put(entry.urlhash());
} else {

@ -55,6 +55,7 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.document.Condenser;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
@ -616,7 +617,7 @@ public final class RWIProcess extends Thread
* @param waitingtime the time this method may take for a result computation
* @return a metadata entry for a url
*/
public URIMetadataRow takeURL(final boolean skipDoubleDom, final long waitingtime) {
public URIMetadata takeURL(final boolean skipDoubleDom, final long waitingtime) {
// returns from the current RWI list the best URL entry and removes this entry from the list
final long timeout = System.currentTimeMillis() + Math.max(10, waitingtime);
int p = -1;
@ -627,7 +628,7 @@ public final class RWIProcess extends Thread
if ( obrwi == null ) {
return null; // all time was already wasted in takeRWI to get another element
}
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi);
final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi);
if ( page == null ) {
try {
this.misses.putUnique(obrwi.getElement().urlhash());
@ -864,7 +865,7 @@ public final class RWIProcess extends Thread
}
final Iterator<String> domhashs = this.hostNavigator.keys(false);
URIMetadataRow row;
URIMetadata row;
byte[] urlhash;
String hosthash, hostname;
if ( this.hostResolver != null ) {

@ -41,7 +41,7 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
@ -454,7 +454,7 @@ public class SnippetProcess {
public void run() {
// start fetching urls and snippets
URIMetadataRow page;
URIMetadata page;
ResultEntry resultEntry;
//final int fetchAhead = snippetMode == 0 ? 0 : 10;
final boolean nav_topics = SnippetProcess.this.query.navigators.equals("all") || SnippetProcess.this.query.navigators.indexOf("topics",0) >= 0;
@ -498,7 +498,7 @@ public class SnippetProcess {
String solrContent = null;
if (this.solr != null) {
SolrDocument sd = null;
final SolrDocumentList sdl = this.solr.get(SolrField.id.getSolrFieldName()+ ":" + ASCII.String(page.hash()), 0, 1);
final SolrDocumentList sdl = this.solr.query(SolrField.id.getSolrFieldName()+ ":" + ASCII.String(page.hash()), 0, 1);
if (!sdl.isEmpty()) {
sd = sdl.get(0);
}
@ -553,7 +553,7 @@ public class SnippetProcess {
}
}
protected ResultEntry fetchSnippet(final URIMetadataRow page, final String solrText, final CacheStrategy cacheStrategy) {
protected ResultEntry fetchSnippet(final URIMetadata page, final String solrText, final CacheStrategy cacheStrategy) {
// Snippet Fetching can has 3 modes:
// 0 - do not fetch snippets
// 1 - fetch snippets offline only

@ -34,7 +34,7 @@ import java.util.List;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.logging.Log;
@ -50,7 +50,7 @@ import net.yacy.search.index.Segment;
public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEntry> {
// payload objects
private final URIMetadataRow urlentry;
private final URIMetadata urlentry;
private String alternative_urlstring;
private String alternative_urlname;
private final TextSnippet textSnippet;
@ -60,7 +60,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
// statistic objects
public long dbRetrievalTime, snippetComputationTime, ranking;
public ResultEntry(final URIMetadataRow urlentry,
public ResultEntry(final URIMetadata urlentry,
final Segment indexSegment,
SeedDB peers,
final TextSnippet textSnippet,

@ -45,7 +45,7 @@ import net.yacy.document.SnippetExtractor;
import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.order.Base64Order;
@ -146,7 +146,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
public TextSnippet(
final LoaderDispatcher loader,
final String solrText,
final URIMetadataRow row,
final URIMetadata row,
final HandleSet queryhashes,
final CacheStrategy cacheStrategy,
final boolean pre,

@ -155,7 +155,7 @@ public class EmbeddedSolrConnector extends AbstractSolrConnector implements Solr
solrdoc.addSolr(SolrField.text_t, "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.");
solr.add(solrdoc);
SolrServlet.startServer("/solr", 8091, solr);
SolrDocumentList searchresult = solr.get(SolrField.text_t.name() + ":tempor", 0, 10);
SolrDocumentList searchresult = solr.query(SolrField.text_t.name() + ":tempor", 0, 10);
for (SolrDocument d : searchresult) {
System.out.println(d.toString());
}

@ -1,4 +1,3 @@
package net.yacy;
// yacy.java
// -----------------------
// (C) by Michael Peter Christen; mc@yacy.net
@ -23,8 +22,8 @@ package net.yacy;
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy;
//import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
@ -61,7 +60,7 @@ import net.yacy.cora.sorting.ScoreMap;
import net.yacy.gui.YaCyApp;
import net.yacy.gui.framework.Browser;
import net.yacy.kelondro.blob.MapDataMining;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
@ -657,11 +656,13 @@ public final class yacy {
log.logInfo("STARTING URL CLEANUP");
// db containing all currently loades urls
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"), "text.urlmd", false, false);
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"));
currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
// db used to hold all neede urls
final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"), "text.urlmd", false, false);
final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"));
minimizedUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
@ -669,7 +670,14 @@ public final class yacy {
log,
new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
10000,
Integer.MAX_VALUE, false, false, false);
Integer.MAX_VALUE,
false, // useTailCache
false, // exceed134217727
false, // connectLocalSolr
false, // useCitationIndex
true, // useRWI
true // useMetadata
);
final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false);
long urlCounter = 0, wordCounter = 0;
@ -689,7 +697,7 @@ public final class yacy {
iEntry = wordIdxEntries.next();
final byte[] urlHash = iEntry.urlhash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
final URIMetadataRow urlEntry = currentUrlDB.load(urlHash);
final URIMetadata urlEntry = currentUrlDB.load(urlHash);
urlCounter++;
minimizedUrlDB.store(urlEntry);
if (urlCounter % 500 == 0) {
@ -829,7 +837,8 @@ public final class yacy {
final File root = dataHome;
final File indexroot = new File(root, "DATA/INDEX");
try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"), "text.urlmd", false, false);
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"));
currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
currentUrlDB.deadlinkCleaner();
currentUrlDB.close();
}
@ -849,7 +858,14 @@ public final class yacy {
log,
new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"),
10000,
Integer.MAX_VALUE, false, false, false);
Integer.MAX_VALUE,
false, // useTailCache
false, // exceed134217727
false, // connectLocalSolr
false, // useCitationIndex
true, // useRWI
true // useMetadata
);
indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);
}
int counter = 0;

Loading…
Cancel
Save