- added CamelCase parser to MultiProtocolURI: generate better to-be-indexed words from urls

- integrated new parser into loader processes: enrich document parser
- fixed a concurrent modification exception in kelondro iterator
- hand-over of document size from crawler to indexer

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7374 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 358feeeb39
commit 56264dcc17

@ -312,8 +312,8 @@ public final class CrawlStacker {
if (protocol.equals("http") || protocol.equals("https")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
if (protocol.equals("ftp")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.ftp.maxFileSize", FTPLoader.DEFAULT_MAXFILESIZE);
if (protocol.equals("smb")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.smb.maxFileSize", SMBLoader.DEFAULT_MAXFILESIZE);
}
// check availability of parser and maxfilesize
if (entry.size() > maxFileSize ||
(entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null)

@ -252,7 +252,7 @@ public class FTPLoader {
responseHeader,
"200",
mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes());
url.toTokens().getBytes());
return response;
}

@ -133,7 +133,7 @@ public class FileLoader {
responseHeader,
"200",
mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes());
url.toTokens().getBytes());
return response;
}

@ -165,10 +165,11 @@ public class Response {
// request and response headers may be zero in case that we process surrogates
this.requestHeader = new RequestHeader();
this.responseHeader = new ResponseHeader();
if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
this.responseStatus = "200";
this.profile = profile;
this.status = QUEUE_STATE_FRESH;
this.content = request.url().toNormalform(true, true).getBytes();
this.content = request.url().toTokens().getBytes();
}
public Response(

@ -154,7 +154,7 @@ public class SMBLoader {
responseHeader,
"200",
mp == null ? null : new CrawlProfile(mp),
url.toNormalform(true, true).getBytes());
url.toTokens().getBytes());
return response;
}

@ -1833,7 +1833,7 @@ public final class Switchboard extends serverSwitch {
doclist.add(document);
}
if (doclist.isEmpty()) return new indexingQueueEntry(in.process, in.queueEntry, in.documents, null);
if (doclist.isEmpty()) return new indexingQueueEntry(in.process, in.queueEntry, in.documents, null);
in.documents = doclist.toArray(new Document[doclist.size()]);
Condenser[] condenser = new Condenser[in.documents.length];
if (this.log.isFine()) log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'");

@ -30,7 +30,9 @@ import java.io.InputStream;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.text.Collator;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
@ -770,6 +772,80 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
/**
 * Returns the string form of this URI, which is the result of
 * {@code toNormalform(false, true)}.
 *
 * @return the normal form of this URI
 */
@Override
public String toString() {
    return toNormalform(false, true);
}
/**
 * Creates the word tokens for this URI by passing the result of
 * {@code toNormalform(true, true)} to {@link #toTokens(String)}.
 *
 * @return the token string computed from the normal form of this URI
 */
public String toTokens() {
    final String normalform = this.toNormalform(true, true);
    return toTokens(normalform);
}
/** url-encoded sequences that are treated as word separators */
private final static String[] replacementStrings = {"%20", "%2B", "%2b"};

/**
 * Create word tokens for the parser.
 * The given string is cut into words at every character that is not an
 * ASCII letter or digit; each word is then additionally split by
 * {@link #parseCamelCase(String)}.
 * The result is the unchanged input string, followed by all distinct
 * words that are longer than one character, each preceded by a single
 * blank. Words appear in the order of their first appearance.
 *
 * @param s the string (i.e. a url) to be tokenized, must not be null
 * @return the input string with all found tokens appended
 */
public static String toTokens(String s) {
    // replace the url-encoded separators by a blank (literal replacement, no regex needed)
    String t = s;
    for (String r: replacementStrings) t = t.replace(r, " ");

    // turn every non-letter/non-digit into a blank and collapse runs of
    // blanks into a single one while copying; leading blanks are dropped
    final StringBuilder sb = new StringBuilder(t.length());
    for (int i = 0; i < t.length(); i++) {
        final char c = t.charAt(i);
        if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
            sb.append(c);
        } else if (sb.length() > 0 && sb.charAt(sb.length() - 1) != ' ') {
            sb.append(' ');
        }
    }

    // split the string into words and add all camel-case splittings;
    // the map is used as an insertion-ordered set of words
    final Map<String, Object> token = new LinkedHashMap<String, Object>();
    for (String r: sb.toString().split(" ")) {
        token.putAll(parseCamelCase(r));
    }

    // construct the result: original string plus all words with more than one character
    final StringBuilder result = new StringBuilder(s);
    for (String v: token.keySet()) {
        if (v.length() > 1) result.append(' ').append(v);
    }
    return result.toString();
}
/** character classes that are distinguished by the camel-case parser */
public static enum CharType { low, high, number; }

/**
 * Split a word at its camel-case boundaries.
 * A new token starts whenever the character class changes to
 * {@link CharType#high} or {@link CharType#number}. A change to
 * {@link CharType#low} does not start a new token, so the lower-case
 * tail stays attached to the preceding characters
 * (i.e. "CamelCase" becomes "Camel" and "Case", "abc123" becomes
 * "abc" and "123", "ABCdef" stays "ABCdef").
 *
 * @param s the word to be split, must not be null
 * @return a map whose keys are the distinct tokens in order of their first
 *         appearance; the values carry no information. The map is empty
 *         if the given word is empty.
 */
public static Map<String, Object> parseCamelCase(String s) {
    final Map<String, Object> token = new LinkedHashMap<String, Object>();
    if (s.length() == 0) return token;
    int p = 0;
    CharType type = charType(s.charAt(0));
    CharType nct = type;
    while (p < s.length()) {
        // skip all characters that have the same type as the current run
        while (p < s.length() && (nct = charType(s.charAt(p))) == type) p++;
        if (p >= s.length()) break; // the remaining word is stored after the loop
        if (nct == CharType.low) {
            // a change to lower-case characters continues the current token
            type = CharType.low;
            p++;
            continue;
        }
        // the char type has changed: cut off the token and continue with the rest
        token.put(s.substring(0, p), new Object());
        s = s.substring(p);
        p = 0;
        type = nct;
    }
    token.put(s, new Object());
    return token;
}

/**
 * Classify a character for the camel-case parser.
 *
 * @param c the character to be classified
 * @return {@link CharType#low} for 'a'..'z', {@link CharType#number} for
 *         '0'..'9' and {@link CharType#high} for every other character
 */
private static CharType charType(char c) {
    if (c >= 'a' && c <= 'z') return CharType.low;
    if (c >= '0' && c <= '9') return CharType.number;
    return CharType.high;
}
public String toNormalform(final boolean excludeReference, final boolean stripAmp) {
return toNormalform(excludeReference, stripAmp, false);
@ -1105,6 +1181,11 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return splitpattern.split(normalizedURL.toLowerCase()); // word components of the url
}
/**
 * Prints the result of {@code toTokens(String)} for every command line
 * argument to standard out, one line per argument.
 *
 * @param args the strings to be tokenized
 */
public static void main(final String[] args) {
    for (int i = 0; i < args.length; i++) {
        final String tokens = toTokens(args[i]);
        System.out.println(tokens);
    }
}
/*
public static void main(final String[] args) {
final String[][] test = new String[][]{
new String[]{null, "C:WINDOWS\\CMD0.EXE"},
@ -1191,5 +1272,6 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
}
}
}
*/
}

@ -252,7 +252,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String f = url.getFile();
final int p = f.lastIndexOf('.');
final String type = (p < 0) ? "" : f.substring(p + 1);
if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg")) {
if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) {
// special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
addImage(images, ie);

@ -27,6 +27,7 @@
package net.yacy.kelondro.util;
import java.util.ArrayList;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.Map;
@ -36,9 +37,20 @@ public class ReverseMapIterator <E, F> implements Iterator<Map.Entry<E, F>> {
E last;
public ReverseMapIterator(Map<E, F> map) {
this.map = map;
this.a = new ArrayList<E>();
for (E e: map.keySet()) a.add(e);
synchronized (map) {
this.map = map;
this.a = new ArrayList<E>();
while (true) {
try {
for (E e: map.keySet()) {
a.add(e);
}
break;
} catch (ConcurrentModificationException e) {
continue;
}
}
}
}
public boolean hasNext() {

Loading…
Cancel
Save