ability to create vocabularies also without any objectspace: this iterates over all urls in the index to create terms
pull/1/head
orbiter 12 years ago
parent ecc10a752c
commit a2160054d7

@@ -68,52 +68,51 @@ public class Vocabulary_p {
         boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
         boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
         boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
-        if (discoveruri != null) {
         Segment segment = sb.index;
-        Iterator<DigestURI> ui = segment.urlSelector(discoveruri);
+        Iterator<DigestURI> ui = segment.urlSelector(discoveruri, 600000L, 100000);
         String t;
         while (ui.hasNext()) {
             DigestURI u = ui.next();
             String u0 = u.toNormalform(true);
             t = "";
             if (discoverFromPath) {
-                t = u0.substring(discoverobjectspace.length());
-                if (t.indexOf('/') >= 0) continue;
-                int p = t.indexOf('.');
-                if (p >= 0) t = t.substring(0, p);
+                int exp = u0.lastIndexOf('.');
+                if (exp < 0) continue;
+                int slp = u0.lastIndexOf('/', exp);
+                if (slp < 0) continue;
+                t = u0.substring(slp, exp);
+                int p;
                 while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
                 while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
-                if (p >= 0) t = t.substring(p + 1);
             }
             if (discoverFromTitle || discoverFromTitleSplitted) {
                 URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
                 if (m != null) t = m.dc_title();
                 if (t.endsWith(".jpg") || t.endsWith(".gif")) continue;
             }
             if (discoverFromAuthor) {
                 URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
                 if (m != null) t = m.dc_creator();
             }
             t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll("  ", " ").trim();
             if (t.isEmpty()) continue;
             if (discoverFromTitleSplitted) {
                 String[] ts = t.split(" ");
                 for (String s: ts) {
                     if (s.isEmpty()) continue;
                     if (s.endsWith(".jpg") || s.endsWith(".gif")) continue;
                     table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
                 }
             } else if (discoverFromAuthor) {
                 String[] ts = t.split(";"); // author names are often separated by ';'
                 for (String s: ts) {
                     if (s.isEmpty()) continue;
                     int p = s.indexOf(','); // check if there is a reversed method to mention the name
                     if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim();
                     table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
                 }
             } else {
                 table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0));
             }
         }
-        }
         Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
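
To see what the rewritten discoverFromPath branch does to a concrete URL, here is a minimal standalone sketch (not part of the commit) that mirrors the same steps on a plain String. The class name, helper method and sample URL are made up for illustration; the real code works on DigestURI objects taken from the index.

public class PathTermSketch {

    // Mirror of the new discoverFromPath logic, applied to a plain url string.
    static String termFromPath(String u0) {
        int exp = u0.lastIndexOf('.');            // dot of the file extension
        if (exp < 0) return null;                 // the servlet loop does 'continue' here
        int slp = u0.lastIndexOf('/', exp);       // last path separator before that dot
        if (slp < 0) return null;
        String t = u0.substring(slp, exp);        // last path element, leading '/' included
        int p;
        while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);  // drop 'prefix:' parts
        while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);  // drop 'key=' parts
        // the same cleanup Vocabulary_p applies to every discovered term
        t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll("  ", " ").trim();
        return t.isEmpty() ? null : t;
    }

    public static void main(String[] args) {
        // hypothetical sample url; prints the derived vocabulary term
        System.out.println(termFromPath("http://example.org/wiki/Semantic_Web.html"));
    }
}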

@@ -219,12 +219,18 @@ public class Segment {
      * @param stub
      * @return an iterator for all matching urls
      */
-    public Iterator<DigestURI> urlSelector(MultiProtocolURI stub) {
-        final String host = stub.getHost();
-        String hh = DigestURI.hosthash(host);
-        final BlockingQueue<SolrDocument> docQueue = this.fulltext.getSolr().concurrentQuery(YaCySchema.host_id_s + ":\"" + hh + "\"", 0, Integer.MAX_VALUE, 600000L, 100000, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName());
-
-        final String urlstub = stub.toNormalform(true);
+    public Iterator<DigestURI> urlSelector(final MultiProtocolURI stub, final long maxtime, final int maxcount) {
+        final BlockingQueue<SolrDocument> docQueue;
+        final String urlstub;
+        if (stub == null) {
+            docQueue = this.fulltext.getSolr().concurrentQuery("*:*", 0, Integer.MAX_VALUE, maxtime, maxcount, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName());
+            urlstub = null;
+        } else {
+            final String host = stub.getHost();
+            String hh = DigestURI.hosthash(host);
+            docQueue = this.fulltext.getSolr().concurrentQuery(YaCySchema.host_id_s + ":\"" + hh + "\"", 0, Integer.MAX_VALUE, maxtime, maxcount, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName());
+            urlstub = stub.toNormalform(true);
+        }
 
         // now filter the stub from the iterated urls
         return new LookAheadIterator<DigestURI>() {

@@ -247,7 +253,7 @@ public class Segment {
                     } catch (MalformedURLException e) {
                         continue;
                     }
-                    if (u.startsWith(urlstub)) return url;
+                    if (urlstub == null || u.startsWith(urlstub)) return url;
                 }
             }
         };
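
As a rough caller-side sketch (again not from the commit), the extended selector can now be driven with or without an object-space URL. Segment, MultiProtocolURI and DigestURI are the existing YaCy types used in the diff above; the helper class and method names are invented, the timeout and count values are simply the ones Vocabulary_p passes, and the imports for the YaCy packages are omitted since this only compiles inside the YaCy source tree.

import java.util.Iterator;

public class UrlSelectorSketch {

    // stub == null  -> "*:*" Solr query, i.e. every url in the index
    // stub != null  -> restricted to the stub's host, then prefix-filtered by the iterator
    static void printUrls(Segment segment, MultiProtocolURI stub) {
        Iterator<DigestURI> ui = segment.urlSelector(stub, 600000L, 100000); // same maxtime/maxcount as Vocabulary_p
        while (ui.hasNext()) {
            System.out.println(ui.next().toNormalform(true));
        }
    }
}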
