From 56264dcc17cb7a5cc0e2a3ca3f6a9c49bc81365f Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 15 Dec 2010 00:03:19 +0000 Subject: [PATCH] - added CamelCase parser to MultiProtocolURI: generate better to-be-indexed words from urls - integrated new parser into loader processes: enrich document parser - fixed a concurrent modification exception in kelondro iterator - hand-over of document size from crawler to indexer git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7374 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/CrawlStacker.java | 2 +- .../anomic/crawler/retrieval/FTPLoader.java | 2 +- .../anomic/crawler/retrieval/FileLoader.java | 2 +- .../de/anomic/crawler/retrieval/Response.java | 3 +- .../anomic/crawler/retrieval/SMBLoader.java | 2 +- source/de/anomic/search/Switchboard.java | 2 +- .../yacy/cora/document/MultiProtocolURI.java | 82 +++++++++++++++++++ .../document/parser/html/ContentScraper.java | 2 +- .../kelondro/util/ReverseMapIterator.java | 18 +++- 9 files changed, 105 insertions(+), 10 deletions(-) diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 514855552..9706e255e 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -312,8 +312,8 @@ public final class CrawlStacker { if (protocol.equals("http") || protocol.equals("https")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); if (protocol.equals("ftp")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.ftp.maxFileSize", FTPLoader.DEFAULT_MAXFILESIZE); if (protocol.equals("smb")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.smb.maxFileSize", SMBLoader.DEFAULT_MAXFILESIZE); - } + // check availability of parser and maxfilesize if (entry.size() > maxFileSize || (entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null) diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index 9a598e054..cff12bd18 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -252,7 +252,7 @@ public class FTPLoader { responseHeader, "200", mp == null ? null : new CrawlProfile(mp), - url.toNormalform(true, true).getBytes()); + url.toTokens().getBytes()); return response; } diff --git a/source/de/anomic/crawler/retrieval/FileLoader.java b/source/de/anomic/crawler/retrieval/FileLoader.java index 238845e5d..e7eb2caba 100644 --- a/source/de/anomic/crawler/retrieval/FileLoader.java +++ b/source/de/anomic/crawler/retrieval/FileLoader.java @@ -133,7 +133,7 @@ public class FileLoader { responseHeader, "200", mp == null ? null : new CrawlProfile(mp), - url.toNormalform(true, true).getBytes()); + url.toTokens().getBytes()); return response; } diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index 306c304d3..0fec79f27 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -165,10 +165,11 @@ public class Response { // request and response headers may be zero in case that we process surrogates this.requestHeader = new RequestHeader(); this.responseHeader = new ResponseHeader(); + if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size())); this.responseStatus = "200"; this.profile = profile; this.status = QUEUE_STATE_FRESH; - this.content = request.url().toNormalform(true, true).getBytes(); + this.content = request.url().toTokens().getBytes(); } public Response( diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java index d58f3f81a..eedf2a7a2 100644 --- a/source/de/anomic/crawler/retrieval/SMBLoader.java +++ b/source/de/anomic/crawler/retrieval/SMBLoader.java @@ -154,7 +154,7 @@ public class SMBLoader { responseHeader, "200", mp == null ? null : new CrawlProfile(mp), - url.toNormalform(true, true).getBytes()); + url.toTokens().getBytes()); return response; } diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 18ffe3a36..2f91f57cf 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1833,7 +1833,7 @@ public final class Switchboard extends serverSwitch { doclist.add(document); } - if (doclist.isEmpty()) return new indexingQueueEntry(in.process, in.queueEntry, in.documents, null); + if (doclist.isEmpty()) return new indexingQueueEntry(in.process, in.queueEntry, in.documents, null); in.documents = doclist.toArray(new Document[doclist.size()]); Condenser[] condenser = new Condenser[in.documents.length]; if (this.log.isFine()) log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'"); diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 4d0a31a08..8f42e44e5 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -30,7 +30,9 @@ import java.io.InputStream; import java.io.Serializable; import java.net.MalformedURLException; import java.text.Collator; +import java.util.LinkedHashMap; import java.util.Locale; +import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; @@ -770,6 +772,80 @@ public class MultiProtocolURI implements Serializable, Comparable= '0' && c <='9') || (c >= 'a' && c <='z') || (c >= 'A' && c <='Z')) sb.append(c); else sb.append(' '); + } + t = sb.toString(); + + // remove all double-spaces + int p; + while ((p = t.indexOf(" ")) >= 0) t = t.substring(0, p) + t.substring(p + 1); + + // split the string into tokens and add all camel-case splitting + String[] u = t.split(" "); + Map token = new LinkedHashMap(); + for (String r: u) { + token.putAll(parseCamelCase(r)); + } + + // construct a String again + for (String v: token.keySet()) if (v.length() > 1) s += " " + v; + return s; + } + + public static enum CharType { low, high, number; } + + public static Map parseCamelCase(String s) { + Map token = new LinkedHashMap(); + if (s.length() == 0) return token; + int p = 0; + CharType type = charType(s.charAt(0)), nct = type; + while (p < s.length()) { + // search for first appearance of an character that is a upper-case + while (p < s.length() && (nct = charType(s.charAt(p))) == type) p++; + if (p >= s.length()) { token.put(s, new Object()); break; } + if (nct == CharType.low) { + type = CharType.low; + p++; continue; + } + + // the char type has changed + token.put(s.substring(0, p), new Object()); + s = s.substring(p); + p = 0; + type = nct; + } + token.put(s, new Object()); + return token; + } + + private static CharType charType(char c) { + if (c >= 'a' && c <= 'z') return CharType.low; + if (c >= '0' && c <= '1') return CharType.number; + return CharType.high; + } public String toNormalform(final boolean excludeReference, final boolean stripAmp) { return toNormalform(excludeReference, stripAmp, false); @@ -1105,6 +1181,11 @@ public class MultiProtocolURI implements Serializable, Comparable implements Iterator> { E last; public ReverseMapIterator(Map map) { - this.map = map; - this.a = new ArrayList(); - for (E e: map.keySet()) a.add(e); + synchronized (map) { + this.map = map; + this.a = new ArrayList(); + while (true) { + try { + for (E e: map.keySet()) { + a.add(e); + } + break; + } catch (ConcurrentModificationException e) { + continue; + } + } + } } public boolean hasNext() {