- added CamelCase parser to MultiProtocolURI: generate better to-be-indexed words from urls

- integrated new parser into loader processes: enrich document parser
- fixed a concurrent modification exception in kelondro iterator
- hand-over of document size from crawler to indexer

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7374 6c8d7289-2bf4-0310-a012-ef5d649a1542
14 years ago · 56264dcc17
parent 358feeeb39
commit 56264dcc17
9 changed files with 105 additions and 10 deletions
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@ -312,8 +312,8 @@ public final class CrawlStacker {
            if (protocol.equals("http") || protocol.equals("https")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
            if (protocol.equals("ftp")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.ftp.maxFileSize", FTPLoader.DEFAULT_MAXFILESIZE);
            if (protocol.equals("smb")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.smb.maxFileSize", SMBLoader.DEFAULT_MAXFILESIZE);
-
        }
+
        // check availability of parser and maxfilesize
        if (entry.size() > maxFileSize ||
            (entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null)
--- a/source/de/anomic/crawler/retrieval/FTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/FTPLoader.java
@ -252,7 +252,7 @@ public class FTPLoader {
                    responseHeader,
                    "200",
                    mp == null ? null : new CrawlProfile(mp),
-                    url.toNormalform(true, true).getBytes());
+                    url.toTokens().getBytes());
            return response;
        }
        
--- a/source/de/anomic/crawler/retrieval/FileLoader.java
+++ b/source/de/anomic/crawler/retrieval/FileLoader.java
@ -133,7 +133,7 @@ public class FileLoader {
                    responseHeader,
                    "200",
                    mp == null ? null : new CrawlProfile(mp),
-                    url.toNormalform(true, true).getBytes());
+                    url.toTokens().getBytes());
            return response;
        }
        
--- a/source/de/anomic/crawler/retrieval/Response.java
+++ b/source/de/anomic/crawler/retrieval/Response.java
@ -165,10 +165,11 @@ public class Response {
        // request and response headers may be zero in case that we process surrogates
        this.requestHeader = new RequestHeader();
        this.responseHeader = new ResponseHeader();
+        if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
        this.responseStatus = "200";
        this.profile = profile;
        this.status = QUEUE_STATE_FRESH;
-        this.content = request.url().toNormalform(true, true).getBytes();
+        this.content = request.url().toTokens().getBytes();
    }
    
    public Response(
--- a/source/de/anomic/crawler/retrieval/SMBLoader.java
+++ b/source/de/anomic/crawler/retrieval/SMBLoader.java
@ -154,7 +154,7 @@ public class SMBLoader {
                    responseHeader,
                    "200",
                    mp == null ? null : new CrawlProfile(mp),
-                    url.toNormalform(true, true).getBytes());
+                    url.toTokens().getBytes());
            return response;
        }
        
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@ -1833,7 +1833,7 @@ public final class Switchboard extends serverSwitch {
            doclist.add(document);
        }
        
-        if (doclist.isEmpty())  return new indexingQueueEntry(in.process, in.queueEntry, in.documents, null);
+        if (doclist.isEmpty()) return new indexingQueueEntry(in.process, in.queueEntry, in.documents, null);
        in.documents = doclist.toArray(new Document[doclist.size()]);
        Condenser[] condenser = new Condenser[in.documents.length];
        if (this.log.isFine()) log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'");
--- a/source/net/yacy/cora/document/MultiProtocolURI.java
+++ b/source/net/yacy/cora/document/MultiProtocolURI.java
@ -30,7 +30,9 @@ import java.io.InputStream;
 import java.io.Serializable;
 import java.net.MalformedURLException;
 import java.text.Collator;
+import java.util.LinkedHashMap;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.regex.Matcher;
@ -770,6 +772,80 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
    public String toString() {
        return toNormalform(false, true);
    }
+
+    /**
+     * Create word tokens from this url: the normal form of the url
+     * (computed with excludeReference = true and stripAmp = true)
+     * is handed to {@link #toTokens(String)}.
+     * @return the result of {@link #toTokens(String)} for the normal form of this url
+     */
+    public String toTokens() {
+        final String normalform = this.toNormalform(true, true);
+        return toTokens(normalform);
+    }
+    
+    /** url-encoded sequences that are replaced by a space before the string is split into tokens */
+    private final static String[] replacementStrings = {"%20", "%2B", "%2b"};
+    
+    /**
+     * Create word tokens for the parser.
+     * The string is split at every character that is not an ASCII letter or digit,
+     * the pieces are split further with {@link #parseCamelCase(String)}, and every
+     * distinct token that is longer than one character is appended to the original string.
+     * Each token is appended only once, in the order of its first appearance.
+     * @param s the string to tokenize, must not be null
+     * @return the unmodified input string, followed by the space-separated tokens
+     */
+    public static String toTokens(String s) {
+        String t = s;
+        
+        // remove all replacement strings; they are literals, so no regular expression is needed
+        for (String r: replacementStrings) t = t.replace(r, " ");
+
+        // replace all non-character & non-number by a space;
+        // runs of spaces are collapsed to a single space in the same pass
+        StringBuilder sb = new StringBuilder(t.length());
+        char c;
+        boolean lastWasSpace = false;
+        for (int i = 0; i < t.length(); i++) {
+            c = t.charAt(i);
+            if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
+                sb.append(c);
+                lastWasSpace = false;
+            } else if (!lastWasSpace) {
+                sb.append(' ');
+                lastWasSpace = true;
+            }
+        }
+
+        // split the string into tokens and add all camel-case splitting;
+        // the LinkedHashMap removes duplicates and keeps the insertion order
+        Map<String, Object> token = new LinkedHashMap<String, Object>();
+        for (String r: sb.toString().split(" ")) {
+            token.putAll(parseCamelCase(r));
+        }
+        
+        // construct a String again: the original string plus all tokens with more than one character
+        StringBuilder result = new StringBuilder(s);
+        for (String v: token.keySet()) if (v.length() > 1) result.append(' ').append(v);
+        return result.toString();
+    }
+    
+    /**
+     * Character classes used by {@link #parseCamelCase(String)}:
+     * low = lower-case ASCII letter, number = ASCII digit,
+     * high = everything else (upper-case letters and any other character).
+     */
+    public static enum CharType { low, high, number; }
+    
+    /**
+     * Split a word at its camel-case boundaries.
+     * A new token starts whenever the character class changes; only a change to
+     * lower-case letters does not split. "FooBar" therefore yields "Foo" and "Bar",
+     * "abc123" yields "abc" and "123", and "HTMLParser" stays a single token.
+     * @param s the word to split
+     * @return a map whose keys are the tokens in order of first appearance; the values
+     *         are placeholder objects without meaning. The map is empty if s is empty.
+     */
+    public static Map<String, Object> parseCamelCase(String s) {
+        Map<String, Object> token = new LinkedHashMap<String, Object>();
+        if (s.length() == 0) return token;
+        int p = 0;
+        CharType type = charType(s.charAt(0)), nct = type;
+        while (p < s.length()) {
+            // search for the first character whose type differs from the current type
+            while (p < s.length() && (nct = charType(s.charAt(p))) == type) p++;
+            // no further type change: the remaining string is the last token
+            if (p >= s.length()) { token.put(s, new Object()); break; }
+            if (nct == CharType.low) {
+                // a change to lower-case continues the current token (e.g. "F" followed by "oo")
+                type = CharType.low;
+                p++; continue;
+            }
+            
+            // the char type has changed: cut off the token and continue with the remainder
+            token.put(s.substring(0, p), new Object());
+            s = s.substring(p);
+            p = 0;
+            type = nct;
+        }
+        // store the remainder; if it was already stored above, this replaces the same key
+        token.put(s, new Object());
+        return token;
+    }
+    
+    /**
+     * Classify a character for the camel-case parser.
+     * @param c the character to classify
+     * @return low for 'a'-'z', number for '0'-'9', high for every other character
+     */
+    private static CharType charType(char c) {
+        if (c >= 'a' && c <= 'z') return CharType.low;
+        // the whole digit range counts as number (was '0'..'1', which classified 2-9 as high)
+        if (c >= '0' && c <= '9') return CharType.number;
+        return CharType.high;
+    }
    
    public String toNormalform(final boolean excludeReference, final boolean stripAmp) {
        return toNormalform(excludeReference, stripAmp, false);
@ -1105,6 +1181,11 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
        return splitpattern.split(normalizedURL.toLowerCase()); // word components of the url
    }

+    /**
+     * Command-line entry point: prints the result of {@link #toTokens(String)}
+     * for every argument, one result per line.
+     * @param args the strings to tokenize
+     */
+    public static void main(final String[] args) {
+        for (int i = 0; i < args.length; i++) {
+            System.out.println(toTokens(args[i]));
+        }
+    }
+
+    /*
    public static void main(final String[] args) {
        final String[][] test = new String[][]{
          new String[]{null, "C:WINDOWS\\CMD0.EXE"},
@ -1191,5 +1272,6 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
            }
        }
    }
+    */

 }
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -252,7 +252,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                final String f = url.getFile();
                final int p = f.lastIndexOf('.');
                final String type = (p < 0) ? "" : f.substring(p + 1);
-                if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg")) {
+                if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) {
                    // special handling of such urls: put them to the image urls
                    final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
                    addImage(images, ie);
--- a/source/net/yacy/kelondro/util/ReverseMapIterator.java
+++ b/source/net/yacy/kelondro/util/ReverseMapIterator.java
@ -27,6 +27,7 @@
 package net.yacy.kelondro.util;

 import java.util.ArrayList;
+import java.util.ConcurrentModificationException;
 import java.util.Iterator;
 import java.util.Map;

@ -36,9 +37,20 @@ public class ReverseMapIterator <E, F> implements Iterator<Map.Entry<E, F>> {
    E last;

    public ReverseMapIterator(Map<E, F> map) {
-        this.map = map;
-        this.a = new ArrayList<E>();
-        for (E e: map.keySet()) a.add(e);
+        // Copy the current keys of the map into the list a.
+        // The copy is retried when the map is modified while its keys are being read.
+        // NOTE(review): synchronized (map) only excludes writers that also lock on the
+        // map object; the retry loop covers writers that do not - confirm against callers.
+        synchronized (map) {
+            this.map = map;
+            this.a = new ArrayList<E>();
+            while (true) {
+                try {
+                    for (E e: map.keySet()) {
+                        a.add(e);
+                    }
+                    break;
+                } catch (ConcurrentModificationException e) {
+                    // discard the partially copied keys, otherwise the retry would add them a second time
+                    a.clear();
+                    continue;
+                }
+            }
+        }
    }
    
    public boolean hasNext() {