diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index 9ccb623f8..2f6ceaa56 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -68,52 +68,51 @@ public class Vocabulary_p { boolean discoverFromTitle = post.get("discovermethod", "").equals("title"); boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); - if (discoveruri != null) { - Segment segment = sb.index; - Iterator ui = segment.urlSelector(discoveruri); - String t; - while (ui.hasNext()) { - DigestURI u = ui.next(); - String u0 = u.toNormalform(true); - t = ""; - if (discoverFromPath) { - t = u0.substring(discoverobjectspace.length()); - if (t.indexOf('/') >= 0) continue; - int p = t.indexOf('.'); - if (p >= 0) t = t.substring(0, p); - while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); - while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); - if (p >= 0) t = t.substring(p + 1); - } - if (discoverFromTitle || discoverFromTitleSplitted) { - URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); - if (m != null) t = m.dc_title(); - if (t.endsWith(".jpg") || t.endsWith(".gif")) continue; - } - if (discoverFromAuthor) { - URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); - if (m != null) t = m.dc_creator(); + Segment segment = sb.index; + Iterator ui = segment.urlSelector(discoveruri, 600000L, 100000); + String t; + while (ui.hasNext()) { + DigestURI u = ui.next(); + String u0 = u.toNormalform(true); + t = ""; + if (discoverFromPath) { + int exp = u0.lastIndexOf('.'); + if (exp < 0) continue; + int slp = u0.lastIndexOf('/', exp); + if (slp < 0) continue; + t = u0.substring(slp, exp); + int p; + while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); + while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); + } + if (discoverFromTitle || discoverFromTitleSplitted) { + URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); + if (m != null) t = m.dc_title(); + if (t.endsWith(".jpg") || t.endsWith(".gif")) continue; + } + if (discoverFromAuthor) { + URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); + if (m != null) t = m.dc_creator(); + } + t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); + if (t.isEmpty()) continue; + if (discoverFromTitleSplitted) { + String[] ts = t.split(" "); + for (String s: ts) { + if (s.isEmpty()) continue; + if (s.endsWith(".jpg") || s.endsWith(".gif")) continue; + table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); } - t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); - if (t.isEmpty()) continue; - if (discoverFromTitleSplitted) { - String[] ts = t.split(" "); - for (String s: ts) { - if (s.isEmpty()) continue; - if (s.endsWith(".jpg") || s.endsWith(".gif")) continue; - table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); - } - } else if (discoverFromAuthor) { - String[] ts = t.split(";"); // author names are often separated by ';' - for (String s: ts) { - if (s.isEmpty()) continue; - int p = s.indexOf(','); // check if there is a reversed method to mention the name - if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim(); - table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); - } - } else { - table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0)); + } else if (discoverFromAuthor) { + String[] ts = t.split(";"); // author names are often separated by ';' + for (String s: ts) { + if (s.isEmpty()) continue; + int p = s.indexOf(','); // check if there is a reversed method to mention the name + if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim(); + table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); } + } else { + table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0)); } } Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 3adfcb6a8..e3bfc69c7 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -219,12 +219,18 @@ public class Segment { * @param stub * @return an iterator for all matching urls */ - public Iterator urlSelector(MultiProtocolURI stub) { - final String host = stub.getHost(); - String hh = DigestURI.hosthash(host); - final BlockingQueue docQueue = this.fulltext.getSolr().concurrentQuery(YaCySchema.host_id_s + ":\"" + hh + "\"", 0, Integer.MAX_VALUE, 600000L, 100000, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName()); - - final String urlstub = stub.toNormalform(true); + public Iterator urlSelector(final MultiProtocolURI stub, final long maxtime, final int maxcount) { + final BlockingQueue docQueue; + final String urlstub; + if (stub == null) { + docQueue = this.fulltext.getSolr().concurrentQuery("*:*", 0, Integer.MAX_VALUE, maxtime, maxcount, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName()); + urlstub = null; + } else { + final String host = stub.getHost(); + String hh = DigestURI.hosthash(host); + docQueue = this.fulltext.getSolr().concurrentQuery(YaCySchema.host_id_s + ":\"" + hh + "\"", 0, Integer.MAX_VALUE, maxtime, maxcount, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName()); + urlstub = stub.toNormalform(true); + } // now filter the stub from the iterated urls return new LookAheadIterator() { @@ -247,7 +253,7 @@ public class Segment { } catch (MalformedURLException e) { continue; } - if (u.startsWith(urlstub)) return url; + if (urlstub == null || u.startsWith(urlstub)) return url; } } };