|
|
|
@ -64,55 +64,58 @@ public class Vocabulary_p {
|
|
|
|
|
if (discoveruri == null) discoverobjectspace = "";
|
|
|
|
|
Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>();
|
|
|
|
|
File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
|
|
|
|
|
boolean discoverNot = post.get("discovermethod", "").equals("none");
|
|
|
|
|
boolean discoverFromPath = post.get("discovermethod", "").equals("path");
|
|
|
|
|
boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
|
|
|
|
|
boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
|
|
|
|
|
boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
|
|
|
|
|
Segment segment = sb.index;
|
|
|
|
|
Iterator<DigestURI> ui = segment.urlSelector(discoveruri, 600000L, 100000);
|
|
|
|
|
String t;
|
|
|
|
|
while (ui.hasNext()) {
|
|
|
|
|
DigestURI u = ui.next();
|
|
|
|
|
String u0 = u.toNormalform(true);
|
|
|
|
|
t = "";
|
|
|
|
|
if (discoverFromPath) {
|
|
|
|
|
int exp = u0.lastIndexOf('.');
|
|
|
|
|
if (exp < 0) continue;
|
|
|
|
|
int slp = u0.lastIndexOf('/', exp);
|
|
|
|
|
if (slp < 0) continue;
|
|
|
|
|
t = u0.substring(slp, exp);
|
|
|
|
|
int p;
|
|
|
|
|
while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
|
|
|
|
|
while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
|
|
|
|
|
}
|
|
|
|
|
if (discoverFromTitle || discoverFromTitleSplitted) {
|
|
|
|
|
URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
|
|
|
|
|
if (m != null) t = m.dc_title();
|
|
|
|
|
if (t.endsWith(".jpg") || t.endsWith(".gif")) continue;
|
|
|
|
|
}
|
|
|
|
|
if (discoverFromAuthor) {
|
|
|
|
|
URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
|
|
|
|
|
if (m != null) t = m.dc_creator();
|
|
|
|
|
}
|
|
|
|
|
t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim();
|
|
|
|
|
if (t.isEmpty()) continue;
|
|
|
|
|
if (discoverFromTitleSplitted) {
|
|
|
|
|
String[] ts = t.split(" ");
|
|
|
|
|
for (String s: ts) {
|
|
|
|
|
if (s.isEmpty()) continue;
|
|
|
|
|
if (s.endsWith(".jpg") || s.endsWith(".gif")) continue;
|
|
|
|
|
table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
|
|
|
|
|
if (!discoverNot) {
|
|
|
|
|
Iterator<DigestURI> ui = segment.urlSelector(discoveruri, 600000L, 100000);
|
|
|
|
|
while (ui.hasNext()) {
|
|
|
|
|
DigestURI u = ui.next();
|
|
|
|
|
String u0 = u.toNormalform(true);
|
|
|
|
|
t = "";
|
|
|
|
|
if (discoverFromPath) {
|
|
|
|
|
int exp = u0.lastIndexOf('.');
|
|
|
|
|
if (exp < 0) continue;
|
|
|
|
|
int slp = u0.lastIndexOf('/', exp);
|
|
|
|
|
if (slp < 0) continue;
|
|
|
|
|
t = u0.substring(slp, exp);
|
|
|
|
|
int p;
|
|
|
|
|
while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
|
|
|
|
|
while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
|
|
|
|
|
}
|
|
|
|
|
if (discoverFromTitle || discoverFromTitleSplitted) {
|
|
|
|
|
URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
|
|
|
|
|
if (m != null) t = m.dc_title();
|
|
|
|
|
if (t.endsWith(".jpg") || t.endsWith(".gif")) continue;
|
|
|
|
|
}
|
|
|
|
|
if (discoverFromAuthor) {
|
|
|
|
|
URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
|
|
|
|
|
if (m != null) t = m.dc_creator();
|
|
|
|
|
}
|
|
|
|
|
} else if (discoverFromAuthor) {
|
|
|
|
|
String[] ts = t.split(";"); // author names are often separated by ';'
|
|
|
|
|
for (String s: ts) {
|
|
|
|
|
if (s.isEmpty()) continue;
|
|
|
|
|
int p = s.indexOf(','); // check if there is a reversed method to mention the name
|
|
|
|
|
if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim();
|
|
|
|
|
table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
|
|
|
|
|
t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim();
|
|
|
|
|
if (t.isEmpty()) continue;
|
|
|
|
|
if (discoverFromTitleSplitted) {
|
|
|
|
|
String[] ts = t.split(" ");
|
|
|
|
|
for (String s: ts) {
|
|
|
|
|
if (s.isEmpty()) continue;
|
|
|
|
|
if (s.endsWith(".jpg") || s.endsWith(".gif")) continue;
|
|
|
|
|
table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
|
|
|
|
|
}
|
|
|
|
|
} else if (discoverFromAuthor) {
|
|
|
|
|
String[] ts = t.split(";"); // author names are often separated by ';'
|
|
|
|
|
for (String s: ts) {
|
|
|
|
|
if (s.isEmpty()) continue;
|
|
|
|
|
int p = s.indexOf(','); // check if there is a reversed method to mention the name
|
|
|
|
|
if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim();
|
|
|
|
|
table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0));
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
|
|
|
|
|