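URL parser regex performance hack: split with a precompiled static Pattern (DigestURI.splitpattern) instead of passing the regex string to String.split on every call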

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6524 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 2126dffea2
commit a37878b7d5

@ -641,7 +641,7 @@ public final class RankingProcess extends Thread {
// take out relevant information for reference computation
if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
//final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
final String[] descrcomps = resultEntry.title().toLowerCase().split(DigestURI.splitrex); // words in the description
final String[] descrcomps = DigestURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description
// add references
//addTopic(urlcomps);

@ -373,7 +373,7 @@ public class ResultFetcher {
// apply 'common-sense' heuristic using references
final String urlstring = rentry.url().toNormalform(true, true);
final String[] urlcomps = DigestURI.urlComps(urlstring);
final String[] descrcomps = rentry.title().toLowerCase().split(DigestURI.splitrex);
final String[] descrcomps = DigestURI.splitpattern.split(rentry.title().toLowerCase());
Integer tc;
for (int j = 0; j < urlcomps.length; j++) {
tc = topwords.get(urlcomps[j]);

@ -616,7 +616,7 @@ public final class yacyClient {
final String references = result.get("references");
yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references);
if (references != null) {
// add references twice, so they can be countet (must have at least 2 entries)
// add references twice, so they can be counted (must have at least 2 entries)
containerCache.addTopic(references.split(","));
containerCache.addTopic(references.split(","));
}

@ -402,7 +402,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (s == null) s = metas.get("dc.description");
if (s == null) s = "";
if (s.length() == 0) {
return getTitle().toLowerCase().split(DigestURI.splitrex);
return DigestURI.splitpattern.split(getTitle().toLowerCase());
}
if (s.contains(",")) return s.split(" |,");
if (s.contains(";")) return s.split(" |;");

@ -938,11 +938,12 @@ public class DigestURI implements Serializable {
return language;
}
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
private static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
public static final Pattern splitpattern = Pattern.compile(splitrex);
public static String[] urlComps(String normalizedURL) {
final int p = normalizedURL.indexOf("//");
if (p > 0) normalizedURL = normalizedURL.substring(p + 2);
return normalizedURL.toLowerCase().split(splitrex); // word components of the url
return splitpattern.split(normalizedURL.toLowerCase()); // word components of the url
}
public static void main(final String[] args) {

Loading…
Cancel
Save