diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html index 30226a0ca..91c272d28 100644 --- a/htroot/Vocabulary_p.html +++ b/htroot/Vocabulary_p.html @@ -109,7 +109,13 @@ To see a list of all APIs, please visit the
Objectspace
-
Discover Terms from
object link file name  object page title  object page title (splitted)  object page author
+
Discover Terms:
+
+ no auto-discovery (empty vocabulary)   + from file name   + from page title   + from page title (splitted)   + from page author
diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index 2f6ceaa56..a3604a177 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -64,55 +64,58 @@ public class Vocabulary_p { if (discoveruri == null) discoverobjectspace = ""; Map table = new TreeMap(); File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); + boolean discoverNot = post.get("discovermethod", "").equals("none"); boolean discoverFromPath = post.get("discovermethod", "").equals("path"); boolean discoverFromTitle = post.get("discovermethod", "").equals("title"); boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); Segment segment = sb.index; - Iterator ui = segment.urlSelector(discoveruri, 600000L, 100000); String t; - while (ui.hasNext()) { - DigestURI u = ui.next(); - String u0 = u.toNormalform(true); - t = ""; - if (discoverFromPath) { - int exp = u0.lastIndexOf('.'); - if (exp < 0) continue; - int slp = u0.lastIndexOf('/', exp); - if (slp < 0) continue; - t = u0.substring(slp, exp); - int p; - while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); - while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); - } - if (discoverFromTitle || discoverFromTitleSplitted) { - URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); - if (m != null) t = m.dc_title(); - if (t.endsWith(".jpg") || t.endsWith(".gif")) continue; - } - if (discoverFromAuthor) { - URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); - if (m != null) t = m.dc_creator(); - } - t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); - if (t.isEmpty()) continue; - if (discoverFromTitleSplitted) { - String[] ts = t.split(" "); - for (String s: ts) { - if (s.isEmpty()) continue; - if (s.endsWith(".jpg") || s.endsWith(".gif")) continue; - table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); + if (!discoverNot) { + Iterator ui = segment.urlSelector(discoveruri, 600000L, 100000); + while (ui.hasNext()) { + DigestURI u = ui.next(); + String u0 = u.toNormalform(true); + t = ""; + if (discoverFromPath) { + int exp = u0.lastIndexOf('.'); + if (exp < 0) continue; + int slp = u0.lastIndexOf('/', exp); + if (slp < 0) continue; + t = u0.substring(slp, exp); + int p; + while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); + while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); + } + if (discoverFromTitle || discoverFromTitleSplitted) { + URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); + if (m != null) t = m.dc_title(); + if (t.endsWith(".jpg") || t.endsWith(".gif")) continue; + } + if (discoverFromAuthor) { + URIMetadataNode m = segment.fulltext().getMetadata(u.hash()); + if (m != null) t = m.dc_creator(); } - } else if (discoverFromAuthor) { - String[] ts = t.split(";"); // author names are often separated by ';' - for (String s: ts) { - if (s.isEmpty()) continue; - int p = s.indexOf(','); // check if there is a reversed method to mention the name - if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim(); - table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); + t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); + if (t.isEmpty()) continue; + if (discoverFromTitleSplitted) { + String[] ts = t.split(" "); + for (String s: ts) { + if (s.isEmpty()) continue; + if (s.endsWith(".jpg") || s.endsWith(".gif")) continue; + table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); + } + } else if (discoverFromAuthor) { + String[] ts = t.split(";"); // author names are often separated by ';' + for (String s: ts) { + if (s.isEmpty()) continue; + int p = s.indexOf(','); // check if there is a reversed method to mention the name + if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim(); + table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0)); + } + } else { + table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0)); } - } else { - table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0)); } } Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);