- added automated generation of vocabularies from url stubs

- added clear of all terms for vocabularies
- added deletion of vocabularies
pull/1/head
Michael Peter Christen 13 years ago
parent 79464189a4
commit e89747bb67

@ -17,26 +17,40 @@
<form action="Vocabulary_p.html" method="get" accept-charset="UTF-8">
<fieldset><legend>Vocabulary Selection</legend>
<dl>
<dt>Select Vocabulary</dt>
<dt>Vocabulary Name</dt>
<dd>
<select name="vocabulary" onchange='this.form.submit()'>
<select name="vocabulary" onchange="this.form.submit()">
#{vocabularyset}#
<option value="#[name]#" #(selected)#::selected="selected"#(/selected)#>#[name]#</option>
#{/vocabularyset}#
</select>
</dd>
<dt></dt>
<dd><input type="submit" name="view" value="View" /></dd>
<dt></dt><dd><input type="submit" name="view" value="View" /></dd>
</dl>
</fieldset>
</form>
#(create)#::
<form action="Vocabulary_p.html" method="get" accept-charset="UTF-8">
<fieldset><legend>Vocabulary Production</legend>
It is possible to produce a vocabulary out of the existing search index. This is done using a given 'objectspace' which you can enter as a URL Stub.
This stub is used to find all matching URLs. If the remaining path from the matching URLs then denotes a single file, the file name is used as vocabulary term.
This works best with wikis. Try to use a wiki url as objectspace path.
<dl>
<dt>Vocabulary Name</dt><dd><input type="text" name="discovername" value="" size="16" maxlength="128" /></dd>
<dt>Objectspace</dt><dd><input type="text" name="discoverobjectspace" value="http://" size="78" maxlength="128" /></dd>
<dt></dt><dd><input type="submit" name="create" value="Create" /></dd>
</dl>
</fieldset>
</form>
#(/create)#
#(edit)#::
<form action="Vocabulary_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Vocabulary Editor</legend>
<dl>
<dt>Vocabulary Name</dt><dd>#[name]#</dd>
<dt>File</dt><dd>#(editable)#[automatically generated, not stored, cannot be edited]::#[file]##(/editable)#</dd>
<dt>Name</dt><dd>#[name]#</dd>
<dt>Namespace</dt><dd>#[namespace]#</dd>
<dt>Predicate</dt><dd>#[predicate]#</dd>
<dt>Prefix</dt><dd>#[prefix]#</dd>
@ -58,22 +72,30 @@
#{terms}#
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td align="center">#(editable)#&nbsp;::<input type="checkbox" name="modify_#[term]#" id="modify_#[term]#" value="checked" disabled="disabled"/>#(/editable)#</td>
<td align="center">#(editable)#&nbsp;::<input type="checkbox" name="delete_#[term]#" id="delete_#[term]#" value="checked" onchange='this.form.submit()'/>#(/editable)#</td>
<td align="center">#(editable)#&nbsp;::<input type="checkbox" name="delete_#[term]#" id="delete_#[term]#" value="checked" onchange="this.form.submit()"/>#(/editable)#</td>
<td align="left">#[term]#</td>
<td align="left">#(editable)##[synonyms]#::<input type="text" name="synonyms_#[term]#" value="#[synonyms]#" size="80" maxlength="1024" onclick="document.getElementById('modify_#[term]#').checked='checked'; document.getElementById('modify_#[term]#').disabled=''"/>#(/editable)#</td>
</tr>
#{/terms}#
#(editable)#::
<tr class="TableCellDark">
<td align="center" colspan="2"><input type="checkbox" name="modify_new" id="modify_new" value="checked" disabled="disabled"/><i>add</i></td>
<td align="left"><input type="text" name="newterm" id="newterm" value="" size="24" maxlength="128" onclick="document.getElementById('modify_new').checked='checked'; document.getElementById('modify_new').disabled=''"/></td>
<td align="center" colspan="2"><input type="checkbox" name="add_new" id="add_new" value="checked" disabled="disabled"/><i>add</i></td>
<td align="left"><input type="text" name="newterm" id="newterm" value="" size="24" maxlength="128" onclick="document.getElementById('add_new').checked='checked'; document.getElementById('add_new').disabled=''"/></td>
<td align="left"><input type="text" name="newsynonyms" id="newsynonyms" value="" size="80" maxlength="1024"/></td>
</tr>
<tr class="TableCellDark">
<td colspan="3"></td>
<td align="right" class="TableCellSummary"><input type="checkbox" name="clear_table" id="clear_table" value="checked" /><i>clear table (remove all terms)</i></td>
</tr>
<tr class="TableCellDark">
<td colspan="3"></td>
<td align="right" class="TableCellSummary"><input type="checkbox" name="delete_vocabulary" id="delete_vocabulary" value="checked" /><i>delete vocabulary</i></td>
</tr>
#(/editable)#
</table>
<input type="hidden" name="vocabulary" value="#[name]#" />
<input type="submit" name="set" value="Submit" />
</fieldset>
<input type="hidden" name="vocabulary" value="#[name]#" />
<input type="submit" name="set" value="Set" />
</form>
#(/edit)#

@ -18,18 +18,26 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Collection;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.lod.vocabulary.DCTerms;
import net.yacy.cora.lod.vocabulary.Owl;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.lod.vocabulary.YaCyMetadata;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -50,30 +58,73 @@ public class Vocabulary_p {
}
prop.put("vocabularyset", count);
if (post != null && vocabulary != null) {
if (post != null) {
try {
// check if objectspace was set
vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()));
if (vocabulary == null) {
// create a vocabulary
String discovername = post.get("discovername", "");
String discoverobjectspace = post.get("discoverobjectspace", "");
MultiProtocolURI discoveruri = null;
if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURI(discoverobjectspace);} catch (MalformedURLException e) {}
if (discovername.length() > 0 && discoveruri != null) {
String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
Segment segment = sb.indexSegments.segment(segmentName);
Iterator<DigestURI> ui = segment.urlSelector(discoveruri);
Map<String,String> table = new TreeMap<String, String>();
File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername);
while (ui.hasNext()) {
DigestURI u = ui.next();
String t = u.toNormalform(false, false).substring(discoverobjectspace.length());
if (t.indexOf('/') >= 0) continue;
int p = t.indexOf('.');
if (p >= 0) t = t.substring(0, p);
while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
if (p >= 0) t = t.substring(p + 1);
if (t.length() == 0) continue;
table.put(t, "");
}
if (table.size() > 0) {
Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
LibraryProvider.autotagging.addVocabulary(newvoc);
vocabulary = newvoc;
}
}
} else {
// check if objectspace was set
vocabulary.setObjectspace(post.get("objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()));
// check if a term was added
if (post.get("modify_new", "").equals("checked") && post.get("newterm", "").length() > 0) {
vocabulary.put(post.get("newterm", ""), post.get("newsynonyms", ""));
}
// check if a term was added
if (post.get("add_new", "").equals("checked") && post.get("newterm", "").length() > 0) {
vocabulary.put(post.get("newterm", ""), post.get("newsynonyms", ""));
}
// check if a term was modified
for (Map.Entry<String, String> e : post.entrySet()) {
if (e.getKey().startsWith("modify_") && e.getValue().equals("checked")) {
String term = e.getKey().substring(7);
String synonyms = post.get("synonyms_" + term, "");
vocabulary.put(term, synonyms);
// check if a term was modified
for (Map.Entry<String, String> e : post.entrySet()) {
if (e.getKey().startsWith("modify_") && e.getValue().equals("checked")) {
String term = e.getKey().substring(7);
String synonyms = post.get("synonyms_" + term, "");
vocabulary.put(term, synonyms);
}
}
}
// check if a term shall be deleted
for (Map.Entry<String, String> e : post.entrySet()) {
if (e.getKey().startsWith("delete_") && e.getValue().equals("checked")) {
String term = e.getKey().substring(7);
vocabulary.delete(term);
// check if a term shall be deleted
for (Map.Entry<String, String> e : post.entrySet()) {
if (e.getKey().startsWith("delete_") && e.getValue().equals("checked")) {
String term = e.getKey().substring(7);
vocabulary.delete(term);
}
}
// check if the vocabulary shall be cleared
if (post.get("clear_table", "").equals("checked") ) {
vocabulary.clear();
}
// check if the vocabulary shall be deleted
if (vocabulary != null && post.get("delete_vocabulary", "").equals("checked") ) {
LibraryProvider.autotagging.deleteVocabulary(vocabularyName);
vocabulary = null;
}
}
} catch (IOException e) {
@ -81,6 +132,8 @@ public class Vocabulary_p {
}
}
prop.put("create", vocabularyName == null ? 1 : 0);
if (vocabulary == null) {
prop.put("edit", 0);
} else {

@ -65,9 +65,30 @@ public class Tagging {
/**
 * Creates a vocabulary with the given name that is backed by a property file
 * and loads the vocabulary content through the init method.
 * @param name the vocabulary name, handed to the single-argument constructor
 * @param propFile the file that is assigned as backing property file
 * @throws IOException declared by this constructor; presumably raised when loading the file fails
 */
public Tagging(String name, File propFile) throws IOException {
this(name);
this.propFile = propFile;
// NOTE(review): two init calls are visible here, one with and one without the name argument.
// This looks like the old and the new line of the same change -- confirm which one must stay.
init(name);
init();
}
/**
 * Creates a new vocabulary from a term table and an objectspace url stub.
 * The property file is (over)written with a "#objectspace:" header line followed
 * by one line per table entry, either "term" or "term:synonyms". Afterwards the
 * vocabulary is loaded from that file through init().
 * @param name the vocabulary name
 * @param propFile the file that stores the vocabulary
 * @param objectspace the url stub written into the objectspace header
 * @param table maps each term to its synonyms; a null or empty value is written as term without synonyms
 * @throws IOException if the property file cannot be written or loaded
 */
public Tagging(String name, File propFile, String objectspace, Map<String,String> table) throws IOException {
    this(name);
    this.propFile = propFile;
    this.objectspace = objectspace;
    BufferedWriter w = new BufferedWriter(new FileWriter(propFile));
    try {
        w.write("#objectspace:" + objectspace + "\n");
        for (Map.Entry<String, String> e: table.entrySet()) {
            String synonyms = e.getValue();
            w.write(e.getKey() + (synonyms == null || synonyms.length() == 0 ? "" : ":" + synonyms) + "\n");
        }
    } finally {
        // release the file handle even if one of the writes fails
        w.close();
    }
    init();
}
/**
 * Placeholder for updating the synonyms of a term.
 * The body is empty in the visible source, so a call has no effect.
 * @param term the term whose synonyms would be updated
 * @param synonyms the synonyms for the term
 */
public void updateTerm(String term, String[] synonyms) {
}
@ -107,7 +128,7 @@ public class Tagging {
w.close();
this.propFile.delete();
tmp.renameTo(this.propFile);
init(this.navigatorName);
init();
}
public void delete(String term) throws IOException {
@ -135,7 +156,19 @@ public class Tagging {
w.close();
this.propFile.delete();
tmp.renameTo(this.propFile);
init(this.navigatorName);
init();
}
/**
 * Removes all terms from this vocabulary.
 * A temporary file is written that contains only the header lines (the namespace
 * if it differs from the default, the objectspace if one is set). That file then
 * replaces the property file and the vocabulary is reloaded through init().
 * Nothing happens if this vocabulary has no property file.
 * @throws IOException if the temporary file cannot be written or the reload fails
 */
public void clear() throws IOException {
    if (this.propFile == null) return;
    File tmp = tmpFile();
    BufferedWriter w = new BufferedWriter(new FileWriter(tmp));
    try {
        if (this.namespace != null && !this.namespace.equals(DEFAULT_NAMESPACE)) w.write("#namespace:" + this.namespace + "\n");
        if (this.objectspace != null && this.objectspace.length() > 0) w.write("#objectspace:" + this.objectspace + "\n");
    } finally {
        // release the file handle even if one of the writes fails
        w.close();
    }
    // swap the old file for the freshly written one, then re-read it
    this.propFile.delete();
    tmp.renameTo(this.propFile);
    init();
}
public void setObjectspace(String os) throws IOException {
@ -161,7 +194,7 @@ public class Tagging {
w.close();
this.propFile.delete();
tmp.renameTo(this.propFile);
init(this.navigatorName);
init();
}
public Map<String, Set<String>> reconstructionSets() {
@ -244,13 +277,13 @@ public class Tagging {
return new String[]{line.substring(0, p), line.substring(p + 1)};
}
public void init(String name) throws IOException {
public void init() throws IOException {
if (this.propFile == null) return;
this.synonym2term.clear();
this.term2synonym.clear();
this.synonym2synonyms.clear();
this.namespace = DEFAULT_NAMESPACE;
this.predicate = this.namespace + name;
this.predicate = this.namespace + this.navigatorName;
this.objectspace = null;
BlockingQueue<String> list = Files.concurentLineReader(this.propFile, 1000);
@ -267,7 +300,7 @@ public class Tagging {
if (comment.startsWith("namespace:")) {
this.namespace = comment.substring(10).trim();
if (!this.namespace.endsWith("/") && !this.namespace.endsWith("#") && this.namespace.length() > 0) this.namespace += "#";
this.predicate = this.namespace + name;
this.predicate = this.namespace + this.navigatorName;
}
if (comment.startsWith("objectspace:")) {
this.objectspace = comment.substring(12).trim();

@ -91,6 +91,16 @@ public class Autotagging {
}
}
/**
 * Computes the location of the file for the vocabulary with the given name.
 * @param name the vocabulary name
 * @return a file called name + ".vocabulary" below the autotagging path
 */
public File getVocabularyFile(String name) {
    final String fileName = name + ".vocabulary";
    return new File(this.autotaggingPath, fileName);
}
/**
 * Unregisters the vocabulary with the given name and deletes its backing file.
 * Nothing happens if no vocabulary is registered under that name.
 * NOTE(review): tags that addVocabulary put into allTags are not removed here.
 * @param name the name of the vocabulary to delete
 */
public void deleteVocabulary(String name) {
    final Tagging v = this.vocabularies.remove(name);
    if (v == null) return;
    // a vocabulary may exist without a property file (Tagging guards propFile == null itself);
    // presumably getFile() returns that property file, so do not dereference it blindly
    final File file = v.getFile();
    if (file != null) file.delete();
}
/**
 * Looks up a registered vocabulary.
 * @param name the vocabulary name
 * @return the entry stored in the vocabularies collection under that name
 */
public Tagging getVocabulary(String name) {
    final Tagging vocabulary = this.vocabularies.get(name);
    return vocabulary;
}
@ -103,6 +113,13 @@ public class Autotagging {
return this.allTags.keySet();
}
/**
 * Registers a vocabulary under its own name and marks every tag of that
 * vocabulary as present in the collection of all tags.
 * @param voc the vocabulary to register
 */
public void addVocabulary(Tagging voc) {
    final String key = voc.getName();
    this.vocabularies.put(key, voc);
    for (final String tag : voc.tags()) {
        this.allTags.put(tag, PRESENT);
    }
}
public void addDictionaries(Map<String, Dictionary> dictionaries) {
for (Map.Entry<String, Dictionary> entry: dictionaries.entrySet()) {
Tagging voc = new Tagging(entry.getKey(), entry.getValue());

@ -62,6 +62,7 @@ import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.ISO639;
import net.yacy.kelondro.util.LookAheadIterator;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.Switchboard;
@ -177,6 +178,69 @@ public class Segment {
return this.urlMetadata.exists(urlhash);
}
/**
 * Collects the url hashes of all urls that belong to a specific host.
 * The host hash of the given host is compared with the part of every stored
 * url hash that ASCII.String(hash, 6, 6) selects; matching hashes are gathered
 * in a set. If the set runs out of space, the exception is logged and the
 * hashes collected so far are returned.
 * @param host the host name
 * @return an iterator for all url hashes that belong to the given host
 */
public Iterator<byte[]> hostSelector(String host) {
    final String wantedHostHash = DigestURI.hosthash(host);
    final HandleSet matches = new HandleSet(12, Base64Order.enhancedCoder, 100);
    for (final byte[] urlhash : this.urlMetadata) {
        final String candidate = ASCII.String(urlhash, 6, 6);
        if (!wantedHostHash.equals(candidate)) continue;
        try {
            matches.putUnique(urlhash);
        } catch (final RowSpaceExceededException e) {
            // no room for further hashes: log and hand out what was found so far
            Log.logException(e);
            break;
        }
    }
    return matches.iterator();
}
/**
 * Discovers all urls that start with a given url stub.
 * The candidates are the urls of the stub's host as delivered by hostSelector;
 * their metadata is loaded lazily while the returned iterator is consumed and
 * only urls whose normal form starts with the normal form of the stub are returned.
 * @param stub the url stub that all returned urls must start with
 * @return an iterator for all matching urls
 */
public Iterator<DigestURI> urlSelector(MultiProtocolURI stub) {
    final Iterator<byte[]> hashes = hostSelector(stub.getHost());
    final String urlstub = stub.toNormalform(false, false);
    return new LookAheadIterator<DigestURI>() {
        @Override
        protected DigestURI next0() {
            while (hashes.hasNext()) {
                final URIMetadataRow umr = Segment.this.urlMetadata.load(hashes.next());
                // NOTE(review): presumably load() yields null when the entry is gone
                // (e.g. removed after the hashes were collected); skip it instead of failing
                if (umr == null) continue;
                final DigestURI u = umr.url();
                if (u != null && u.toNormalform(false, false).startsWith(urlstub)) return u;
            }
            // null tells the look-ahead iterator that no further element exists
            return null;
        }
    };
}
public void clear() {
try {
this.termIndex.clear();

Loading…
Cancel
Save