ability to create vocabularies also without any objectspace: this iterates over all URLs in the index to create terms
pull/1/head
orbiter 12 years ago
parent ecc10a752c
commit a2160054d7
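
In short: the vocabulary discovery loop in Vocabulary_p no longer requires an objectspace URL. When discoveruri is null, the extended Segment.urlSelector() walks the whole index. A rough sketch of the resulting flow, reusing the identifiers from the diffs below (deriveTerm is a purely illustrative stand-in for the path/title/author branches, not a real method):

    Segment segment = sb.index;
    // discoveruri may now be null: urlSelector then iterates over all indexed URLs
    Iterator<DigestURI> ui = segment.urlSelector(discoveruri, 600000L, 100000);
    while (ui.hasNext()) {
        DigestURI u = ui.next();
        String term = deriveTerm(u);   // hypothetical helper; the real branches follow below
        if (term == null || term.isEmpty()) continue;
        table.put(term, new Tagging.SOTuple(Tagging.normalizeTerm(term), u.toNormalform(true)));
    }
    Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);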

@@ -68,52 +68,51 @@ public class Vocabulary_p {
         boolean discoverFromTitle = post.get("discovermethod", "").equals("title");
         boolean discoverFromTitleSplitted = post.get("discovermethod", "").equals("titlesplitted");
         boolean discoverFromAuthor = post.get("discovermethod", "").equals("author");
-        if (discoveruri != null) {
-            Segment segment = sb.index;
-            Iterator<DigestURI> ui = segment.urlSelector(discoveruri);
-            String t;
-            while (ui.hasNext()) {
-                DigestURI u = ui.next();
-                String u0 = u.toNormalform(true);
-                t = "";
-                if (discoverFromPath) {
-                    t = u0.substring(discoverobjectspace.length());
-                    if (t.indexOf('/') >= 0) continue;
-                    int p = t.indexOf('.');
-                    if (p >= 0) t = t.substring(0, p);
-                    while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
-                    while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
-                    if (p >= 0) t = t.substring(p + 1);
-                }
-                if (discoverFromTitle || discoverFromTitleSplitted) {
-                    URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
-                    if (m != null) t = m.dc_title();
-                    if (t.endsWith(".jpg") || t.endsWith(".gif")) continue;
-                }
-                if (discoverFromAuthor) {
-                    URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
-                    if (m != null) t = m.dc_creator();
-                }
+        Segment segment = sb.index;
+        Iterator<DigestURI> ui = segment.urlSelector(discoveruri, 600000L, 100000);
+        String t;
+        while (ui.hasNext()) {
+            DigestURI u = ui.next();
+            String u0 = u.toNormalform(true);
+            t = "";
+            if (discoverFromPath) {
+                int exp = u0.lastIndexOf('.');
+                if (exp < 0) continue;
+                int slp = u0.lastIndexOf('/', exp);
+                if (slp < 0) continue;
+                t = u0.substring(slp, exp);
+                int p;
+                while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
+                while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
+            }
+            if (discoverFromTitle || discoverFromTitleSplitted) {
+                URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
+                if (m != null) t = m.dc_title();
+                if (t.endsWith(".jpg") || t.endsWith(".gif")) continue;
+            }
+            if (discoverFromAuthor) {
+                URIMetadataNode m = segment.fulltext().getMetadata(u.hash());
+                if (m != null) t = m.dc_creator();
+            }
-                t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll("  ", " ").trim();
-                if (t.isEmpty()) continue;
-                if (discoverFromTitleSplitted) {
-                    String[] ts = t.split(" ");
-                    for (String s: ts) {
-                        if (s.isEmpty()) continue;
-                        if (s.endsWith(".jpg") || s.endsWith(".gif")) continue;
-                        table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
-                    }
+            t = t.replaceAll("_", " ").replaceAll("\"", " ").replaceAll("'", " ").replaceAll(",", " ").replaceAll("  ", " ").trim();
+            if (t.isEmpty()) continue;
+            if (discoverFromTitleSplitted) {
+                String[] ts = t.split(" ");
+                for (String s: ts) {
+                    if (s.isEmpty()) continue;
+                    if (s.endsWith(".jpg") || s.endsWith(".gif")) continue;
+                    table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
+                }
-                } else if (discoverFromAuthor) {
-                    String[] ts = t.split(";"); // author names are often separated by ';'
-                    for (String s: ts) {
-                        if (s.isEmpty()) continue;
-                        int p = s.indexOf(','); // check if there is a reversed method to mention the name
-                        if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim();
-                        table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
-                    }
-                } else {
-                    table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0));
-                }
-            }
-        }
+            } else if (discoverFromAuthor) {
+                String[] ts = t.split(";"); // author names are often separated by ';'
+                for (String s: ts) {
+                    if (s.isEmpty()) continue;
+                    int p = s.indexOf(','); // check if there is a reversed method to mention the name
+                    if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim();
+                    table.put(s, new Tagging.SOTuple(Tagging.normalizeTerm(s), u0));
+                }
+            } else {
+                table.put(t, new Tagging.SOTuple(Tagging.normalizeTerm(t), u0));
+            }
+        }
         Tagging newvoc = new Tagging(discovername, propFile, discoverobjectspace, table);
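
Two pieces of the new term derivation are easy to check in isolation: the discoverFromPath branch now cuts the term out of the last path segment (between the final '/' and the final '.'), and the discoverFromAuthor branch splits multi-author strings on ';' and un-reverses "Last, First" names. A small standalone sketch of both (class name and sample inputs are invented; the real code additionally runs the replaceAll/normalizeTerm cleanup shown above):

    import java.util.ArrayList;
    import java.util.List;

    public class TermDiscoveryDemo {

        // Mirrors the new discoverFromPath branch: take the substring between the
        // last '/' and the last '.' of the URL, then strip any "key:"/"key=" prefixes.
        // Note that the leading '/' is kept, just as in the servlet code.
        static String termFromPath(String url) {
            int exp = url.lastIndexOf('.');
            if (exp < 0) return null;             // no dot at all -> skip this URL
            int slp = url.lastIndexOf('/', exp);
            if (slp < 0) return null;             // no path separator -> skip this URL
            String t = url.substring(slp, exp);
            int p;
            while ((p = t.indexOf(':')) >= 0) t = t.substring(p + 1);
            while ((p = t.indexOf('=')) >= 0) t = t.substring(p + 1);
            return t;
        }

        // Mirrors the discoverFromAuthor branch: split on ';' and rewrite a
        // reversed "Last, First" notation into "First Last".
        static List<String> authorTerms(String dcCreator) {
            List<String> terms = new ArrayList<String>();
            for (String s : dcCreator.split(";")) {
                if (s.isEmpty()) continue;
                int p = s.indexOf(',');
                if (p >= 0) s = s.substring(p + 1).trim() + " " + s.substring(0, p).trim();
                terms.add(s);
            }
            return terms;
        }

        public static void main(String[] args) {
            // prints "/Duck_Pond" (the later replaceAll chain turns '_' into ' ')
            System.out.println(termFromPath("http://example.org/wiki/Duck_Pond.html"));
            // prints "[Ada Lovelace, Alan Turing]"
            System.out.println(authorTerms("Lovelace, Ada;Turing, Alan"));
        }
    }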

@@ -219,12 +219,18 @@ public class Segment {
      * @param stub
      * @return an iterator for all matching urls
      */
-    public Iterator<DigestURI> urlSelector(MultiProtocolURI stub) {
-        final String host = stub.getHost();
-        String hh = DigestURI.hosthash(host);
-        final BlockingQueue<SolrDocument> docQueue = this.fulltext.getSolr().concurrentQuery(YaCySchema.host_id_s + ":\"" + hh + "\"", 0, Integer.MAX_VALUE, 600000L, 100000, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName());
-        final String urlstub = stub.toNormalform(true);
+    public Iterator<DigestURI> urlSelector(final MultiProtocolURI stub, final long maxtime, final int maxcount) {
+        final BlockingQueue<SolrDocument> docQueue;
+        final String urlstub;
+        if (stub == null) {
+            docQueue = this.fulltext.getSolr().concurrentQuery("*:*", 0, Integer.MAX_VALUE, maxtime, maxcount, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName());
+            urlstub = null;
+        } else {
+            final String host = stub.getHost();
+            String hh = DigestURI.hosthash(host);
+            docQueue = this.fulltext.getSolr().concurrentQuery(YaCySchema.host_id_s + ":\"" + hh + "\"", 0, Integer.MAX_VALUE, maxtime, maxcount, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName());
+            urlstub = stub.toNormalform(true);
+        }

         // now filter the stub from the iterated urls
         return new LookAheadIterator<DigestURI>() {
@@ -247,7 +253,7 @@ public class Segment {
                     } catch (MalformedURLException e) {
                         continue;
                     }
-                    if (u.startsWith(urlstub)) return url;
+                    if (urlstub == null || u.startsWith(urlstub)) return url;
                 }
             }
         };
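
For reference, a hedged usage sketch of the extended selector (not part of the commit; segment stands for an existing Segment instance such as sb.index). A null stub issues a catch-all "*:*" Solr query and disables the prefix filter, so the iterator walks every URL in the local index; a non-null stub restricts the query to the stub's host hash and then filters by URL prefix:

    // maxtime/maxcount bound the underlying concurrent Solr query; the values
    // below are the ones Vocabulary_p passes in the first hunk.
    Iterator<DigestURI> all = segment.urlSelector(null, 600000L, 100000);
    while (all.hasNext()) {
        System.out.println(all.next().toNormalform(true));
    }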
