From 4b5e0c1500a40f160fa426aa5e0b60c5f6f7cfb2 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Tue, 9 Oct 2012 11:24:48 +0200
Subject: [PATCH] added an url rewriter which can be used to remove session ids from urls

---
 .../crawler/retrieval/URLRewriterLibrary.java | 139 ++++++++++++++++++
 source/net/yacy/document/LibraryProvider.java |  11 ++
 source/net/yacy/search/Switchboard.java       |  10 +-
 3 files changed, 159 insertions(+), 1 deletion(-)
 create mode 100644 source/net/yacy/crawler/retrieval/URLRewriterLibrary.java

diff --git a/source/net/yacy/crawler/retrieval/URLRewriterLibrary.java b/source/net/yacy/crawler/retrieval/URLRewriterLibrary.java
new file mode 100644
index 000000000..158161d3b
--- /dev/null
+++ b/source/net/yacy/crawler/retrieval/URLRewriterLibrary.java
@@ -0,0 +1,139 @@
+/**
+ *  URLRewriterLibrary
+ *  Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+ *  first published 08.10.2012 on http://yacy.net
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.crawler.retrieval;
+
+import java.io.File;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.concurrent.BlockingQueue;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import net.yacy.cora.storage.Files;
+
+import org.apache.log4j.Logger;
+
+/**
+ * A library of regular expression rewriting rules for urls, e.g. to remove session ids.
+ * Each rule maps a pattern to a replacement string; the rules are applied one after
+ * another in the order in which they were loaded.
+ */
+public class URLRewriterLibrary {
+
+    private final static Logger log = Logger.getLogger(URLRewriterLibrary.class);
+
+    private final File rewritingPath;
+    // a LinkedHashMap keeps the loading order, because the rules are applied in sequence
+    private final Map<Pattern, String> rewriters;
+
+    /**
+     * Load the rewriting rules from all files in the given directory.
+     * Every non-empty line which does not start with '#' is a rule of the form
+     * {@code pattern=replacement}, or {@code pattern:replacement} if the line has no '='.
+     * The line is split at the first separator, so a literal '=' inside of the pattern
+     * must be written as {@code \x3D}. Lines starting with "s/" are skipped.
+     * @param rewritingPath the directory with the rule files; the library stays empty
+     *        if this is null, does not exist or cannot be listed
+     */
+    public URLRewriterLibrary(final File rewritingPath) {
+        this.rewriters = new LinkedHashMap<Pattern, String>();
+        this.rewritingPath = rewritingPath;
+        if (this.rewritingPath == null || !this.rewritingPath.exists()) {
+            return;
+        }
+        final String[] files = this.rewritingPath.list();
+        if (files == null) {
+            // list() returns null if the path is not a directory or an I/O error occurs
+            return;
+        }
+        for (final String f: files) {
+            File ff = new File(this.rewritingPath, f);
+            try {
+                BlockingQueue<String> list = Files.concurentLineReader(ff, 1000);
+                String line;
+                // identity comparison is intended: POISON_LINE presumably marks the end of the file
+                while ((line = list.take()) != Files.POISON_LINE) {
+                    line = line.trim();
+                    if (line.length() == 0 || line.charAt(0) == '#') continue;
+                    if (!line.startsWith("s/")) {
+                        int p = line.indexOf('=');
+                        if (p < 0) p = line.indexOf(':');
+                        if (p > 0) try {
+                            this.rewriters.put(Pattern.compile(line.substring(0, p)), line.substring(p + 1));
+                        } catch (PatternSyntaxException e) {
+                            log.warn("bad pattern: " + line.substring(0, p));
+                        }
+                    }
+                }
+            } catch (InterruptedException e) {
+                // restore the interrupt flag for the caller and stop loading
+                Thread.currentThread().interrupt();
+                log.warn("interrupted while reading rewriter file " + f, e);
+                return;
+            } catch (Throwable e) {
+                log.warn("cannot read rewriter file " + f, e);
+            }
+        }
+    }
+
+    /**
+     * Create an empty library; {@link #apply(String)} then returns its input unchanged.
+     */
+    public URLRewriterLibrary() {
+        this.rewriters = new LinkedHashMap<Pattern, String>();
+        this.rewritingPath = null;
+    }
+
+    /**
+     * Apply all rewriting rules to an url string.
+     * Every occurrence of a rule pattern is replaced, a pattern does not need
+     * to cover the whole string.
+     * @param s the url string, may be null
+     * @return the rewritten string; s itself if s is null or if no rule is loaded
+     */
+    public String apply(String s) {
+        if (s == null || this.rewriters.isEmpty()) return s;
+        for (Map.Entry<Pattern, String> entry: this.rewriters.entrySet()) {
+            Matcher m = entry.getKey().matcher(s);
+            // find() and not matches(): rules like "&sid=..." describe only a part of the url
+            if (m.find()) s = m.replaceAll(entry.getValue());
+        }
+        return s;
+    }
+
+    /**
+     * Demonstration: apply some example rules to a sample url and print the result.
+     */
+    public static void main(String[] args) {
+        URLRewriterLibrary lib = new URLRewriterLibrary();
+        lib.rewriters.put(Pattern.compile("cln_\\d+\\/"), ""); // www.bund.de
+        lib.rewriters.put(Pattern.compile("&administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
+        lib.rewriters.put(Pattern.compile("\\?administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
+        lib.rewriters.put(Pattern.compile("\\(X\\([1]\\)"), ""); // herzogenrath
+        lib.rewriters.put(Pattern.compile("\\(S\\([0-9a-z]+\\)\\)\\/"), ""); // herzogenrath
+        lib.rewriters.put(Pattern.compile("&ccm=[0-9]*"), ""); // herne
+        lib.rewriters.put(Pattern.compile("&sid=[0-9]{14}.{8}"), ""); // startercenter
+        String s = "http://www.example.com/cln_012/page.html?id=5&sid=20121009112448abcdefgh";
+        System.out.println(lib.apply(s));
+    }
+
+}
diff --git a/source/net/yacy/document/LibraryProvider.java b/source/net/yacy/document/LibraryProvider.java
index bf258d469..8b31a6363 100644
--- a/source/net/yacy/document/LibraryProvider.java
+++ b/source/net/yacy/document/LibraryProvider.java
@@ -53,6 +53,7 @@ import net.yacy.cora.lod.JenaTripleStore;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
 import net.yacy.cora.storage.Files;
+import net.yacy.crawler.retrieval.URLRewriterLibrary;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;

@@ -64,12 +65,14 @@ public class LibraryProvider {
     public static final String path_to_did_you_mean_dictionaries = "didyoumean";
     public static final String path_to_autotagging_dictionaries = "autotagging";
     public static final String path_to_synonym_dictionaries = "synonyms";
+    public static final String path_to_rewriter_dictionaries = "rewriter";

     public static final String disabledExtension = ".disabled";

     public static WordCache dymLib = new WordCache(null);
     public static AutotaggingLibrary autotagging = null;
     public static SynonymLibrary synonyms = null;
+    public static URLRewriterLibrary urlRewriter = new URLRewriterLibrary();
     public static OverarchingLocation geoLoc = new OverarchingLocation();
     private static File dictSource = null;
     private static File dictRoot = null;
@@ -124,6 +127,7 @@ public class LibraryProvider {
         activateDeReWo();
         initDidYouMean();
         initSynonyms();
+        initRewriter();
         integrateOpenGeoDB();
         integrateGeonames0(-1);
         integrateGeonames1(-1);
@@ -195,6 +199,13 @@ public class LibraryProvider {
         }
         synonyms = new SynonymLibrary(synonymPath);
     }
+    public static void initRewriter() {
+        final File rewriterPath = new File(dictRoot, path_to_rewriter_dictionaries);
+        if ( !rewriterPath.exists() ) {
+            rewriterPath.mkdirs();
+        }
+        urlRewriter = new URLRewriterLibrary(rewriterPath);
+    }
     public static void activateDeReWo() {
         // translate input files (once..)
         final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries);
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index cf499ffd3..5515e8fd3 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2410,13 +2410,21 @@ public final class Switchboard extends serverSwitch
                 // process the next hyperlink
                 nextUrl = nextEntry.getKey();
-                final String u = nextUrl.toNormalform(true, true, true);
+                String u = nextUrl.toNormalform(true, true, true);
                 if ( !(u.startsWith("http://") || u.startsWith("https://") || u.startsWith("ftp://") || u.startsWith("smb://") || u.startsWith("file://")) ) {
                     continue;
                 }
+
+                // rewrite the url
+                String u0 = LibraryProvider.urlRewriter.apply(u);
+                if (!u.equals(u0)) {
+                    log.logInfo("REWRITE of url = \"" + u + "\" to \"" + u0 + "\"");
+                    u = u0;
+                }
+
                 // enqueue the hyperlink into the pre-notice-url db
                 try {
                     this.crawlStacker.enqueueEntry(new Request(