diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index 9ccb623f8..2f6ceaa56 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -68,52 +68,51 @@ public class Vocabulary_p { boolean discoverFromTitle = post.get("discovermethod", "").equals("title"); boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); - if (discoveruri != null) { - Segment segment = sb.index; - Iterator ui = segment.urlSelector(discoveruri); - String t; - while (ui.hasNext()) { - DigestURI u = ui.next(); - String u0 = u.toNormalform(true); - t = ""; - if (discoverFromPath) { - t = u0.substring(discoverobjectspace.length()); - if (t.indexOf('/') >= 0) continue; - int p = t.indexOf('.'); - if (p >= 0) t = t.substring(0, p); - while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); - while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); - if (p >= 0) t = t.substring(p + 1); - } - if (discoverFromTitle || discoverFromTitleSplitted) { - URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); - if (m != null) t = m.dc_title(); - if (t.endsWith(".jpg") || t.endsWith(".gif")) continue; - } - if (discoverFromAuthor) { - URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); - if (m != null) t = m.dc_creator(); + Segment segment = sb.index; + Iterator ui = segment.urlSelector(discoveruri, 600000L, 100000); + String t; + while (ui.hasNext()) { + DigestURI u = ui.next(); + String u0 = u.toNormalform(true); + t = ""; + if (discoverFromPath) { + int exp = u0.lastIndexOf('.'); + if (exp < 0) continue; + int slp = u0.lastIndexOf('/', exp); + if (slp < 0) continue; + t = u0.substring(slp, exp); + int p; + while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); + while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); + } + if (discoverFromTitle || discoverFromTitleSplitted) { + URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); + if (m != null) t = m.dc_title(); + if (t.endsWith(".jpg") || t.endsWith(".gif")) continue; + } + if (discoverFromAuthor) { + URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); + if (m != null) t = m.dc_creator(); + } + t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); + if (t.isEmpty()) continue; + if (discoverFromTitleSplitted) { + String[] ts = t.split(" "); + for (String s: ts) { + if (s.isEmpty()) continue; + if (s.endsWith(".jpg") || s.endsWith(".gif")) continue; + table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); } - t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); - if (t.isEmpty()) continue; - if (discoverFromTitleSplitted) { - String[] ts = t.split(" "); - for (String s: ts) { - if (s.isEmpty()) continue; - if (s.endsWith(".jpg") || s.endsWith(".gif")) continue; - table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); - } - } else if (discoverFromAuthor) { - String[] ts = t.split(";"); // author names are often separated by ';' - for (String s: ts) { - if (s.isEmpty()) continue; - int p = s.indexOf(','); // check if there is a reversed method to mention the name - if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim(); - table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); - } - } else { - table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0)); + } else if (discoverFromAuthor) { + String[] ts = t.split(";"); // author names are often separated by ';' + for (String s: ts) { + if (s.isEmpty()) continue; + int p = s.indexOf(','); // check if there is a reversed method to mention the name + if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim(); + table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); } + } else { + table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0)); } } Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index fd068b6fd..e3bfc69c7 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -28,6 +28,7 @@ package net.yacy.search.index; import java.io.File; import java.io.IOException; +import java.net.MalformedURLException; import java.util.Date; import java.util.Iterator; import java.util.Map; @@ -218,28 +219,41 @@ public class Segment { * @param stub * @return an iterator for all matching urls */ - public Iterator urlSelector(MultiProtocolURI stub) { - final String host = stub.getHost(); - String hh = DigestURI.hosthash(host); - final BlockingQueue hostQueue = this.fulltext.getSolr().concurrentIDs(YaCySchema.host_id_s + ":" + hh, 0, Integer.MAX_VALUE, 10000); - - final String urlstub = stub.toNormalform(true); + public Iterator urlSelector(final MultiProtocolURI stub, final long maxtime, final int maxcount) { + final BlockingQueue docQueue; + final String urlstub; + if (stub == null) { + docQueue = this.fulltext.getSolr().concurrentQuery("*:*", 0, Integer.MAX_VALUE, maxtime, maxcount, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName()); + urlstub = null; + } else { + final String host = stub.getHost(); + String hh = DigestURI.hosthash(host); + docQueue = this.fulltext.getSolr().concurrentQuery(YaCySchema.host_id_s + ":\"" + hh + "\"", 0, Integer.MAX_VALUE, maxtime, maxcount, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName()); + urlstub = stub.toNormalform(true); + } // now filter the stub from the iterated urls return new LookAheadIterator() { @Override protected DigestURI next0() { while (true) { - String id; + SolrDocument doc; try { - id = hostQueue.take(); + doc = docQueue.take(); } catch (InterruptedException e) { Log.logException(e); return null; } - if (id == null || id == AbstractSolrConnector.POISON_ID) return null; - DigestURI u = Segment.this.fulltext.getURL(ASCII.getBytes(id)); - if (u.toNormalform(true).startsWith(urlstub)) return u; + if (doc == null || doc == AbstractSolrConnector.POISON_DOCUMENT) return null; + String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName()); + String id = (String) doc.getFieldValue(YaCySchema.id.getSolrFieldName()); + DigestURI url; + try { + url = new DigestURI(u, ASCII.getBytes(id)); + } catch (MalformedURLException e) { + continue; + } + if (urlstub == null || u.startsWith(urlstub)) return url; } } };