From 0fbd74920750878a4a51d479de9e6b89ef8399b0 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 16 Jun 2012 15:57:00 +0200 Subject: [PATCH 01/15] ipv6 update --- source/net/yacy/cora/protocol/Domains.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index 4baf0020c..dadeb3005 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -663,7 +663,7 @@ public class Domains { public static final InetAddress parseInetAddress(String ip) { if (ip == null || ip.length() < 8) return null; - if (ip.equals("0:0:0:0:0:0:0:1%0")) ip = "127.0.0.1"; + if (isLocalhost(ip)) ip = "127.0.0.1"; final String[] ips = dotPattern.split(ip); if (ips.length != 4) return null; final byte[] ipb = new byte[4]; @@ -884,7 +884,9 @@ public class Domains { return (noLocalCheck || // DO NOT REMOVE THIS! it is correct to return true if the check is off "127.0.0.1".equals(host) || "localhost".equals(host) || - host.startsWith("0:0:0:0:0:0:0:1") + host.startsWith("0:0:0:0:0:0:0:1") || + host.startsWith("::1/") || + "::1".equals(host) ); } @@ -908,7 +910,7 @@ public class Domains { // FIXME IPv4 only // check local ip addresses if (matchesList(host, INTRANET_PATTERNS)) return true; - if (host.startsWith("0:0:0:0:0:0:0:1")) return true; + if (isLocalhost(host)) return true; // check if there are other local IP addresses that are not in // the standard IP range From 64c0268b2bee26d70cd0d0acc6a74a3d3fd94151 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 16 Jun 2012 17:40:15 +0200 Subject: [PATCH 02/15] show triplestore metadata in yacydoc and viewfile --- htroot/ViewFile.html | 1 + htroot/ViewFile.java | 2 ++ htroot/api/yacydoc.html | 7 ++++++ htroot/api/yacydoc.java | 22 ++++++++++++++++--- htroot/api/yacydoc.xml | 1 - source/net/yacy/cora/lod/JenaTripleStore.java | 19 ++++++++++++++++ source/net/yacy/document/Autotagging.java | 2 +- source/net/yacy/search/query/RWIProcess.java | 4 +--- 8 files changed, 50 insertions(+), 8 deletions(-) diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html index b93063dba..6c50e32aa 100644 --- a/htroot/ViewFile.html +++ b/htroot/ViewFile.html @@ -42,6 +42,7 @@
Description:
#[desc]#
Size:
#[size]# Bytes
#(mimeTypeAvailable)#::
MimeType:
#[mimeType]#
#(/mimeTypeAvailable)# +
Triplestore:
#[triples]#
:
-
Objectspace
+
Objectspace
+
+
Discover Terms from
object link file name  object page title  object page title (splitted)  object page author
diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index 5a7eeca93..b1899c254 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -35,6 +35,7 @@ import net.yacy.cora.lod.vocabulary.YaCyMetadata; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.LibraryProvider; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; @@ -50,44 +51,69 @@ public class Vocabulary_p { Collection vocs = LibraryProvider.autotagging.getVocabularies(); String vocabularyName = (post == null) ? null : post.get("vocabulary", null); + String discovername = (post == null) ? null : post.get("discovername", null); Tagging vocabulary = vocabularyName == null ? null : LibraryProvider.autotagging.getVocabulary(vocabularyName); if (vocabulary == null) vocabularyName = null; - int count = 0; - for (Tagging v: vocs) { - prop.put("vocabularyset_" + count + "_name", v.getName()); - prop.put("vocabularyset_" + count + "_selected", (vocabularyName != null && vocabularyName.equals(v.getName())) ? 1 : 0); - count++; - } - prop.put("vocabularyset", count); - if (post != null) { try { if (vocabulary == null) { // create a vocabulary - String discovername = post.get("discovername", ""); - if (discovername.length() > 0) { + if (discovername != null && discovername.length() > 0) { String discoverobjectspace = post.get("discoverobjectspace", ""); MultiProtocolURI discoveruri = null; if (discoverobjectspace.length() > 0) try {discoveruri = new MultiProtocolURI(discoverobjectspace);} catch (MalformedURLException e) {} if (discoveruri == null) discoverobjectspace = ""; Map table = new TreeMap(); File propFile = LibraryProvider.autotagging.getVocabularyFile(discovername); + boolean discoverFromPath = post.get("discovermethod", "").equals("path"); + boolean discoverFromTitle = post.get("discovermethod", "").equals("title"); + boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted"); + boolean discoverFromAuthor = post.get("discovermethod", "").equals("author"); if (discoveruri != null) { String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); Segment segment = sb.indexSegments.segment(segmentName); Iterator ui = segment.urlSelector(discoveruri); + String t; while (ui.hasNext()) { DigestURI u = ui.next(); String u0 = u.toNormalform(true, false); - String t = u0.substring(discoverobjectspace.length()); - if (t.indexOf('/') >= 0) continue; - int p = t.indexOf('.'); - if (p >= 0) t = t.substring(0, p); - while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); - while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); - if (p >= 0) t = t.substring(p + 1); + t = ""; + if (discoverFromPath) { + t = u0.substring(discoverobjectspace.length()); + if (t.indexOf('/') >= 0) continue; + int p = t.indexOf('.'); + if (p >= 0) t = t.substring(0, p); + while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1); + while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1); + if (p >= 0) t = t.substring(p + 1); + } + if (discoverFromTitle || discoverFromTitleSplitted) { + URIMetadataRow m = segment.urlMetadata().load(u.hash()); + if (m != null) t = m.dc_title(); + } + if (discoverFromAuthor) { + URIMetadataRow m = segment.urlMetadata().load(u.hash()); + if (m != null) t = m.dc_creator(); + } + t = t.replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll(" ", " ").trim(); if (t.length() == 0) continue; - table.put(t, new Tagging.SOTuple("", u0)); + if (discoverFromTitleSplitted) { + String[] ts = t.split(" "); + for (String s: ts) { + if (s.length() == 0) continue; + table.put(s, new Tagging.SOTuple("", u0)); + } + } else if (discoverFromAuthor) { + String[] ts = t.split(";"); // author names are often separated by ';' + for (String s: ts) { + if (s.length() == 0) continue; + int p = s.indexOf(','); // check if there is a reversed method to mention the name + if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim(); + table.put(s, new Tagging.SOTuple("", u0)); + } + } else { + table.put(t, new Tagging.SOTuple("", u0)); + } } } Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table); @@ -143,6 +169,14 @@ public class Vocabulary_p { } } + int count = 0; + for (Tagging v: vocs) { + prop.put("vocabularyset_" + count + "_name", v.getName()); + prop.put("vocabularyset_" + count + "_selected", ((vocabularyName != null && vocabularyName.equals(v.getName())) || (discovername != null && discovername.equals(v.getName()))) ? 1 : 0); + count++; + } + prop.put("vocabularyset", count); + prop.put("create", vocabularyName == null ? 1 : 0); if (vocabulary == null) { diff --git a/htroot/interaction/Triple.java b/htroot/interaction/Triple.java index 95c1e23f6..58913d028 100644 --- a/htroot/interaction/Triple.java +++ b/htroot/interaction/Triple.java @@ -98,55 +98,37 @@ public class Triple { String s = ""; String p = ""; String o = ""; + String result = ""; Boolean global = false; - if(post != null){ + if (post != null) { - if(post.containsKey("s")){ - s = post.get("s"); - } + s = post.get("s", ""); + p = post.get("p", ""); + o = post.get("o", ""); - if(post.containsKey("sp")){ - s = post.get("sp") + "#" + s; - } - - if(post.containsKey("p")){ - p = post.get("p"); - } - - if(post.containsKey("pp")){ - p = post.get("pp") + "#" + p; - } - - if(post.containsKey("o")){ - o = post.get("o"); - } + if (post.containsKey("sp")) s = post.get("sp") + "#" + s; + if (post.containsKey("pp")) p = post.get("pp") + "#" + p; global = post.containsKey("global"); + if (post.containsKey("load")) { + if (global) { + result = JenaTripleStore.getObject(s, p); + } else { + result = JenaTripleStore.getPrivateObject(s, p, username); + } + } else { + if (global) { + JenaTripleStore.addTriple(s, p, o); + } else { + JenaTripleStore.addTriple(s, p, o, username); + } + } } - if (post.containsKey("load")) { - - if (global) { - o = JenaTripleStore.getObject(s, p); - } else { - o = JenaTripleStore.getPrivateObject(s, p, username); - } - - - } else { - - if (global) { - JenaTripleStore.addTriple(s, p, o); - } else { - JenaTripleStore.addTriple(s, p, o, username); - } - - } - - prop.put("result", o); + prop.put("result", result); return prop; } diff --git a/source/net/yacy/cora/lod/JenaTripleStore.java b/source/net/yacy/cora/lod/JenaTripleStore.java index 7220a8637..40f604f74 100644 --- a/source/net/yacy/cora/lod/JenaTripleStore.java +++ b/source/net/yacy/cora/lod/JenaTripleStore.java @@ -179,11 +179,11 @@ public class JenaTripleStore { } public static String getObject(final String subject, final String predicate) { - Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " ... "); - Iterator ni = JenaTripleStore.getObjects(subject, predicate); - if (!ni.hasNext()) return ""; - return ni.next().toString(); + String object = ""; + if (ni.hasNext()) object = ni.next().toString(); + Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " - " + object); + return object; } public static Iterator getObjects(final String subject, final String predicate) { @@ -192,11 +192,11 @@ public class JenaTripleStore { } public static String getPrivateObject(final String subject, final String predicate, final String username) { - Log.logInfo("TRIPLESTORE", "GET " + subject + " - " + predicate + " ... ("+username+")"); - Iterator ni = JenaTripleStore.getPrivateObjects(subject, predicate, username); - if (!ni.hasNext()) return ""; - return ni.next().toString(); + String object = ""; + if (ni.hasNext()) object = ni.next().toString(); + Log.logInfo("TRIPLESTORE", "GET (" + username + ") " + subject + " - " + predicate + " - " + object); + return object; } private static Iterator getPrivateObjects(final String subject, final String predicate, final String username) { From be928815fcb15132354aa8c217b6610f6e37b3ce Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 17 Jun 2012 17:18:19 +0200 Subject: [PATCH 14/15] fixed wrong parsing of style and script --- .../net/yacy/document/parser/html/ContentScraper.java | 6 ++++-- .../yacy/document/parser/html/TransformerWriter.java | 10 ++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index fe2f8c559..386722b59 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -99,7 +99,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { strong(TagType.pair), i(TagType.pair), li(TagType.pair), - script(TagType.pair); + script(TagType.pair), + style(TagType.pair); public TagType type; private Tag(final TagType type) { @@ -201,6 +202,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { @Override public void scrapeText(final char[] newtext, final String insideTag) { // System.out.println("SCRAPE: " + UTF8.String(newtext)); + if (insideTag != null && ("script".equals(insideTag) || "style".equals(insideTag))) return; int p, pl, q, s = 0; // match evaluation pattern @@ -434,7 +436,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } @Override - public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) { + public void scrapeTag1(final String tagname, final Properties tagopts, char[] text) { // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text)); if (tagname.equalsIgnoreCase("a") && text.length < 2048) { final String href = tagopts.getProperty("href", EMPTY_STRING); diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index e2db4aa77..46c52dcaf 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -202,7 +202,7 @@ public final class TransformerWriter extends Writer { if (tag == null) { // case (1): this is not a tag opener/closer - if (this.scraper != null) this.scraper.scrapeText(content, null); + if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null); if (this.transformer != null) return this.transformer.transformText(content); return content; } @@ -222,7 +222,9 @@ public final class TransformerWriter extends Writer { // we are collection tag text for the tag 'filterTag' -> case (4) - (7) if (tag == null || tag.equals("!")) { // case (4): getting no tag, go on collecting content - if (this.scraper != null) this.scraper.scrapeText(content, this.filterTag); + if (this.scraper != null) { + this.scraper.scrapeText(content, this.filterTag); + } if (this.transformer != null) { this.filterCont.append(this.transformer.transformText(content)); } else { @@ -330,7 +332,7 @@ public final class TransformerWriter extends Writer { if (in[1] == '/') { // a closing tag tagend = tagEnd(in, 2); - tag = new String(in, 2, tagend - 2); + tag = new String(in, 2, tagend - 2).toLowerCase(); final char[] text = new char[in.length - tagend - 1]; System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); return filterTag(tag, false, text, quotechar); @@ -338,7 +340,7 @@ public final class TransformerWriter extends Writer { // an opening tag tagend = tagEnd(in, 1); - tag = new String(in, 1, tagend - 1); + tag = new String(in, 1, tagend - 1).toLowerCase(); final char[] text = new char[in.length - tagend - 1]; System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); return filterTag(tag, true, text, quotechar); From 743b0ec89fa5939adec51e74b9694b3af6400e95 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 17 Jun 2012 17:32:52 +0200 Subject: [PATCH 15/15] - added size of vocabulary to vocabulary view - fixed bad terms in vocabulary-from-titles autogeneration --- htroot/Vocabulary_p.html | 1 + htroot/Vocabulary_p.java | 3 +++ source/net/yacy/cora/lod/vocabulary/Tagging.java | 5 ++--- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html index c073b9abf..0e980fd7f 100644 --- a/htroot/Vocabulary_p.html +++ b/htroot/Vocabulary_p.html @@ -107,6 +107,7 @@ function updatepage(str) {
Vocabulary Name
#[name]#
File
#(editable)#[automatically generated, not stored, cannot be edited]::#[file]##(/editable)#
+
Size
#[size]#
Namespace
#[namespace]#
Predicate
#[predicate]#
Prefix
#[prefix]#
diff --git a/htroot/Vocabulary_p.java b/htroot/Vocabulary_p.java index b1899c254..87080a1a4 100644 --- a/htroot/Vocabulary_p.java +++ b/htroot/Vocabulary_p.java @@ -90,6 +90,7 @@ public class Vocabulary_p { if (discoverFromTitle || discoverFromTitleSplitted) { URIMetadataRow m = segment.urlMetadata().load(u.hash()); if (m != null) t = m.dc_title(); + if (t.endsWith(".jpg") || t.endsWith(".gif")) continue; } if (discoverFromAuthor) { URIMetadataRow m = segment.urlMetadata().load(u.hash()); @@ -101,6 +102,7 @@ public class Vocabulary_p { String[] ts = t.split(" "); for (String s: ts) { if (s.length() == 0) continue; + if (s.endsWith(".jpg") || s.endsWith(".gif")) continue; table.put(s, new Tagging.SOTuple("", u0)); } } else if (discoverFromAuthor) { @@ -189,6 +191,7 @@ public class Vocabulary_p { prop.putHTML("edit_editable_file", editable ? vocabulary.getFile().getAbsolutePath() : ""); prop.putHTML("edit_name", vocabulary.getName()); prop.putHTML("edit_namespace", vocabulary.getNamespace()); + prop.put("edit_size", vocabulary.size()); prop.putHTML("edit_predicate", vocabulary.getPredicate()); prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX); prop.putHTML("edit_editable_objectspace", vocabulary.getObjectspace() == null ? "" : vocabulary.getObjectspace()); diff --git a/source/net/yacy/cora/lod/vocabulary/Tagging.java b/source/net/yacy/cora/lod/vocabulary/Tagging.java index 1e0331d74..57604d0aa 100644 --- a/source/net/yacy/cora/lod/vocabulary/Tagging.java +++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java @@ -169,9 +169,8 @@ public class Tagging { } - - public void updateTerm(String term, String[] synonyms) { - + public int size() { + return this.term2objectlink.size(); } private File tmpFile() {