From e89747bb674aebde7cef9a24f3bdce8183ae226d Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 13 Jun 2012 15:53:18 +0200 Subject: [PATCH] - added automated generation of vocabularies from url stubs - added clear of all terms for vocabularies - added deletion of vocabularies --- htroot/Vocabulary_p.html | 42 +++++++-- htroot/Vocabulary_p.java | 91 +++++++++++++++---- .../net/yacy/cora/lod/vocabulary/Tagging.java | 47 ++++++++-- source/net/yacy/document/Autotagging.java | 17 ++++ source/net/yacy/search/index/Segment.java | 64 +++++++++++++ 5 files changed, 225 insertions(+), 36 deletions(-) diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html index 5febbdf29..09ce3884c 100644 --- a/htroot/Vocabulary_p.html +++ b/htroot/Vocabulary_p.html @@ -17,26 +17,40 @@
Vocabulary Selection
-
Select Vocabulary
+
Vocabulary Name
- #{vocabularyset}# #{/vocabularyset}#
-
-
+
+ #(create)#:: +
+
Vocabulary Production + It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub. + This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term. + This works best with wikis. Try to use a wiki url as objectspace path. +
+
Vocabulary Name
+
Objectspace
+
+
+
+
+ #(/create)# + #(edit)#::
Vocabulary Editor
+
Vocabulary Name
#[name]#
File
#(editable)#[automatically generated, not stored, cannot be edited]::#[file]##(/editable)#
-
Name
#[name]#
Namespace
#[namespace]#
Predicate
#[predicate]#
Prefix
#[prefix]#
@@ -58,22 +72,30 @@ #{terms}# #(editable)# ::#(/editable)# - #(editable)# ::#(/editable)# + #(editable)# ::#(/editable)# #[term]# #(editable)##[synonyms]#::#(/editable)# #{/terms}# #(editable)#:: - add - + add + + + + clear table (remove all terms) + + + + delete vocabulary + #(/editable)# + +
- -
#(/edit)# diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index 85d2d877d..c88775156 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -18,18 +18,26 @@ * If not, see . */ +import java.io.File; import java.io.IOException; +import java.net.MalformedURLException; import java.util.Collection; +import java.util.Iterator; import java.util.Map; +import java.util.TreeMap; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.lod.vocabulary.DCTerms; import net.yacy.cora.lod.vocabulary.Owl; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.LibraryProvider; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; +import net.yacy.search.index.Segment; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -50,30 +58,73 @@ public class Vocabulary_p { } prop.put("vocabularyset", count); - if (post != null && vocabulary != null) { + if (post != null) { try { - // check if objectspace was set - vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace())); + if (vocabulary == null) { + // create a vocabulary + String discovername = post.get("discovername", ""); + String discoverobjectspace = post.get("discoverobjectspace", ""); + MultiProtocolURI discoveruri = null; + if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURI(discoverobjectspace);} catch (MalformedURLException e) {} + if (discovername.length() > 0 && discoveruri != null) { + String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); + Segment segment = sb.indexSegments.segment(segmentName); + Iterator ui = segment.urlSelector(discoveruri); + Map table = new TreeMap(); + File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); + while (ui.hasNext()) { + DigestURI u = ui.next(); + String t = u.toNormalform(false, false).substring(discoverobjectspace.length()); + if (t.indexOf('/') >= 0) continue; + int p = t.indexOf('.'); + if (p >= 0) t = t.substring(0, p); + while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); + while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); + if (p >= 0) t = t.substring(p + 1); + if (t.length() == 0) continue; + table.put(t, ""); + } + if (table.size() > 0) { + Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); + LibraryProvider.autotagging.addVocabulary(newvoc); + vocabulary = newvoc; + } + } + } else { + // check if objectspace was set + vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace())); - // check if a term was added - if (post.get("modify_new", "").equals("checked") && post.get("newterm", "").length() > 0) { - vocabulary.put(post.get("newterm", ""), post.get("newsynonyms", "")); - } + // check if a term was added + if (post.get("add_new", "").equals("checked") && post.get("newterm", "").length() > 0) { + vocabulary.put(post.get("newterm", ""), post.get("newsynonyms", "")); + } - // check if a term was modified - for (Map.Entry e : post.entrySet()) { - if (e.getKey().startsWith("modify_") && e.getValue().equals("checked")) { - String term = e.getKey().substring(7); - String synonyms = post.get("synonyms_" + term, ""); - vocabulary.put(term, synonyms); + // check if a term was modified + for (Map.Entry e : post.entrySet()) { + if (e.getKey().startsWith("modify_") && e.getValue().equals("checked")) { + String term = e.getKey().substring(7); + String synonyms = post.get("synonyms_" + term, ""); + vocabulary.put(term, synonyms); + } } - } - // check if a term shall be deleted - for (Map.Entry e : post.entrySet()) { - if (e.getKey().startsWith("delete_") && e.getValue().equals("checked")) { - String term = e.getKey().substring(7); - vocabulary.delete(term); + // check if a term shall be deleted + for (Map.Entry e : post.entrySet()) { + if (e.getKey().startsWith("delete_") && e.getValue().equals("checked")) { + String term = e.getKey().substring(7); + vocabulary.delete(term); + } + } + + // check if the vocabulary shall be cleared + if (post.get("clear_table", "").equals("checked") ) { + vocabulary.clear(); + } + + // check if the vocabulary shall be deleted + if (vocabulary != null && post.get("delete_vocabulary", "").equals("checked") ) { + LibraryProvider.autotagging.deleteVocabulary(vocabularyName); + vocabulary = null; } } } catch (IOException e) { @@ -81,6 +132,8 @@ public class Vocabulary_p { } } + prop.put("create", vocabularyName == null ? 1 : 0); + if (vocabulary == null) { prop.put("edit", 0); } else { diff --git a/source/net/yacy/cora/lod/vocabulary/Tagging.java b/source/net/yacy/cora/lod/vocabulary/Tagging.java index f2d2ed8f7..efd469eee 100644 --- a/source/net/yacy/cora/lod/vocabulary/Tagging.java +++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java @@ -65,9 +65,30 @@ public class Tagging { public Tagging(String name, File propFile) throws IOException { this(name); this.propFile = propFile; - init(name); + init(); } + /** + * initialize a new Tagging file with a given table and objectspace url stub + * @param name + * @param propFile + * @param objectspace + * @param table + * @throws IOException + */ + public Tagging(String name, File propFile, String objectspace, Map table) throws IOException { + this(name); + this.propFile = propFile; + this.objectspace = objectspace; + BufferedWriter w = new BufferedWriter(new FileWriter(propFile)); + w.write("#objectspace:" + objectspace + "\n"); + for (Map.Entry e: table.entrySet()) { + w.write(e.getKey() + (e.getValue() == null || e.getValue().length() == 0 ? "" : ":" + e.getValue()) + "\n"); + } + w.close(); + init(); + } + public void updateTerm(String term, String[] synonyms) { } @@ -107,7 +128,7 @@ public class Tagging { w.close(); this.propFile.delete(); tmp.renameTo(this.propFile); - init(this.navigatorName); + init(); } public void delete(String term) throws IOException { @@ -135,7 +156,19 @@ public class Tagging { w.close(); this.propFile.delete(); tmp.renameTo(this.propFile); - init(this.navigatorName); + init(); + } + + public void clear() throws IOException { + if (this.propFile == null) return; + File tmp = tmpFile(); + BufferedWriter w = new BufferedWriter(new FileWriter(tmp)); + if (this.namespace != null && !this.namespace.equals(DEFAULT_NAMESPACE)) w.write("#namespace:" + this.namespace + "\n"); + if (this.objectspace != null && this.objectspace.length() > 0) w.write("#objectspace:" + this.objectspace + "\n"); + w.close(); + this.propFile.delete(); + tmp.renameTo(this.propFile); + init(); } public void setObjectspace(String os) throws IOException { @@ -161,7 +194,7 @@ public class Tagging { w.close(); this.propFile.delete(); tmp.renameTo(this.propFile); - init(this.navigatorName); + init(); } public Map> reconstructionSets() { @@ -244,13 +277,13 @@ public class Tagging { return new String[]{line.substring(0, p), line.substring(p + 1)}; } - public void init(String name) throws IOException { + public void init() throws IOException { if (this.propFile == null) return; this.synonym2term.clear(); this.term2synonym.clear(); this.synonym2synonyms.clear(); this.namespace = DEFAULT_NAMESPACE; - this.predicate = this.namespace + name; + this.predicate = this.namespace + this.navigatorName; this.objectspace = null; BlockingQueue list = Files.concurentLineReader(this.propFile, 1000); @@ -267,7 +300,7 @@ public class Tagging { if (comment.startsWith("namespace:")) { this.namespace = comment.substring(10).trim(); if (!this.namespace.endsWith("/") && !this.namespace.endsWith("#") && this.namespace.length() > 0) this.namespace += "#"; - this.predicate = this.namespace + name; + this.predicate = this.namespace + this.navigatorName; } if (comment.startsWith("objectspace:")) { this.objectspace = comment.substring(12).trim(); diff --git a/source/net/yacy/document/Autotagging.java b/source/net/yacy/document/Autotagging.java index 0a01f1458..16594ccdd 100644 --- a/source/net/yacy/document/Autotagging.java +++ b/source/net/yacy/document/Autotagging.java @@ -91,6 +91,16 @@ public class Autotagging { } } + public File getVocabularyFile(String name) { + return new File(this.autotaggingPath, name + ".vocabulary"); + } + + public void deleteVocabulary(String name) { + Tagging v = this.vocabularies.remove(name); + if (v == null) return; + v.getFile().delete(); + } + public Tagging getVocabulary(String name) { return this.vocabularies.get(name); } @@ -103,6 +113,13 @@ public class Autotagging { return this.allTags.keySet(); } + public void addVocabulary(Tagging voc) { + this.vocabularies.put(voc.getName(), voc); + for (String t: voc.tags()) { + this.allTags.put(t, PRESENT); + } + } + public void addDictionaries(Map dictionaries) { for (Map.Entry entry: dictionaries.entrySet()) { Tagging voc = new Tagging(entry.getKey(), entry.getValue()); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index fd90400c6..210a3f280 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -62,6 +62,7 @@ import net.yacy.kelondro.rwi.IndexCell; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.util.ISO639; +import net.yacy.kelondro.util.LookAheadIterator; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.Switchboard; @@ -177,6 +178,69 @@ public class Segment { return this.urlMetadata.exists(urlhash); } + /** + * discover all urls that belong to a specific host + * and return an iterator for the url hashes of those urls + * @param host + * @return an iterator for all url hashes that belong to a specific host + */ + public Iterator hostSelector(String host) { + String hh = DigestURI.hosthash(host); + final HandleSet ref = new HandleSet(12, Base64Order.enhancedCoder, 100); + for (byte[] b: this.urlMetadata) { + if (hh.equals(ASCII.String(b, 6, 6))) { + try { + ref.putUnique(b); + } catch (RowSpaceExceededException e) { + Log.logException(e); + break; + } + } + } + return ref.iterator(); + } + + /** + * discover all urls that start with a given url stub + * @param stub + * @return an iterator for all matching urls + */ + public Iterator urlSelector(MultiProtocolURI stub) { + final String host = stub.getHost(); + final Iterator bi = hostSelector(host); + final String urlstub = stub.toNormalform(false, false); + + // get all urls from the specific domain + final Iterator urls = new Iterator() { + @Override + public boolean hasNext() { + return bi.hasNext(); + } + @Override + public DigestURI next() { + URIMetadataRow umr = Segment.this.urlMetadata.load(bi.next()); + return umr.url(); + } + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + + // now filter the stub from the iterated urls + return new LookAheadIterator() { + @Override + protected DigestURI next0() { + DigestURI u; + while (urls.hasNext()) { + u = urls.next(); + if (u.toNormalform(false, false).startsWith(urlstub)) return u; + } + return null; + } + }; + } + public void clear() { try { this.termIndex.clear();