added option to create empty vocabularies

pull/1/head
orbiter 12 years ago
parent d957739441
commit c1b7e61882

@ -109,7 +109,13 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd>
<dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" />
<div id="searchresults"></div></dd>
<dt>Discover Terms from</dt><dd><input type="radio" name="discovermethod" value="path" checked="checked" />object link file name&nbsp;&nbsp;<input type="radio" name="discovermethod" value="title" />object page title&nbsp;&nbsp;<input type="radio" name="discovermethod" value="titlesplitted" />object page title (splitted)&nbsp;&nbsp;<input type="radio" name="discovermethod" value="author" />object page author</dd>
<dt>Discover Terms:</dt>
<dd>
<input type="radio" name="discovermethod" value="none" checked="checked" />no auto-discovery (empty vocabulary)&nbsp;&nbsp;
<input type="radio" name="discovermethod" value="path" />from file name&nbsp;&nbsp;
<input type="radio" name="discovermethod" value="title" />from page title&nbsp;&nbsp;
<input type="radio" name="discovermethod" value="titlesplitted" />from page title (splitted)&nbsp;&nbsp;
<input type="radio" name="discovermethod" value="author" />from page author</dd>
<dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
</dl>
</fieldset>

@ -64,55 +64,58 @@ public class Vocabulary_p {
if (discoveruri == null) discoverobjectspace = "";
Map<String, Tagging.SOTuple> table = new TreeMap<String, Tagging.SOTuple>();
File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
boolean discoverNot = post.get("discovermethod", "").equals("none");
boolean discoverFromPath = post.get("discovermethod", "").equals("path");
boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
Segment segment = sb.index;
Iterator<DigestURI> ui = segment.urlSelector(discoveruri, 600000L, 100000);
String t;
while (ui.hasNext()) {
DigestURI u = ui.next();
String u0 = u.toNormalform(true);
t = "";
if (discoverFromPath) {
int exp = u0.lastIndexOf('.');
if (exp < 0) continue;
int slp = u0.lastIndexOf('/', exp);
if (slp < 0) continue;
t = u0.substring(slp, exp);
int p;
while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
}
if (discoverFromTitle || discoverFromTitleSplitted) {
URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
if (m != null) t = m.dc_title();
if (t.endsWith(".jpg") || t.endsWith(".gif")) continue;
}
if (discoverFromAuthor) {
URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
if (m != null) t = m.dc_creator();
}
t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim();
if (t.isEmpty()) continue;
if (discoverFromTitleSplitted) {
String[] ts = t.split(" ");
for (String s: ts) {
if (s.isEmpty()) continue;
if (s.endsWith(".jpg") || s.endsWith(".gif")) continue;
table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
if (!discoverNot) {
Iterator<DigestURI> ui = segment.urlSelector(discoveruri, 600000L, 100000);
while (ui.hasNext()) {
DigestURI u = ui.next();
String u0 = u.toNormalform(true);
t = "";
if (discoverFromPath) {
int exp = u0.lastIndexOf('.');
if (exp < 0) continue;
int slp = u0.lastIndexOf('/', exp);
if (slp < 0) continue;
t = u0.substring(slp, exp);
int p;
while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
}
if (discoverFromTitle || discoverFromTitleSplitted) {
URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
if (m != null) t = m.dc_title();
if (t.endsWith(".jpg") || t.endsWith(".gif")) continue;
}
if (discoverFromAuthor) {
URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
if (m != null) t = m.dc_creator();
}
} else if (discoverFromAuthor) {
String[] ts = t.split(";"); // author names are often separated by ';'
for (String s: ts) {
if (s.isEmpty()) continue;
int p = s.indexOf(','); // check if there is a reversed method to mention the name
if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim();
table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim();
if (t.isEmpty()) continue;
if (discoverFromTitleSplitted) {
String[] ts = t.split(" ");
for (String s: ts) {
if (s.isEmpty()) continue;
if (s.endsWith(".jpg") || s.endsWith(".gif")) continue;
table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
}
} else if (discoverFromAuthor) {
String[] ts = t.split(";"); // author names are often separated by ';'
for (String s: ts) {
if (s.isEmpty()) continue;
int p = s.indexOf(','); // check if there is a reversed method to mention the name
if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim();
table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
}
} else {
table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0));
}
} else {
table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0));
}
}
Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);

Loading…
Cancel
Save