added a URL rewriter which can be used to remove session ids from URLs

pull/1/head
Michael Peter Christen 12 years ago
parent 9190599d21
commit 4b5e0c1500

@ -0,0 +1,102 @@
/**
* URLRewriterLibrary
* Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 08.10.2012 on http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler.retrieval;
import java.io.File;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.storage.Files;
import org.apache.log4j.Logger;
/**
 * A library of regular-expression based URL rewriting rules, e.g. to strip
 * session ids from urls.
 *
 * Rules are read from all files inside a given directory. Each line of such a
 * file is trimmed and then handled as follows:
 * <ul>
 * <li>empty lines and lines starting with '#' are ignored</li>
 * <li>lines starting with "s/" are ignored</li>
 * <li>all other lines are split at the first '=' (or, if the line has no '=',
 *     at the first ':'); the left part is compiled as a regular expression, the
 *     right part is the replacement string. Lines without a separator, or with
 *     the separator at the very beginning, are ignored.</li>
 * </ul>
 * Because the split happens at the first '=', a pattern read from a file
 * cannot contain a '=' character itself.
 */
public class URLRewriterLibrary {

    private static final Logger log = Logger.getLogger(URLRewriterLibrary.class);

    /** directory the rules were loaded from; null if the library was created without a path */
    private final File rewritingPath;

    /** rewriting rules (pattern -> replacement); kept in insertion order so that apply() is deterministic */
    private final Map<Pattern, String> rewriters;

    /**
     * Load all rewriting rules from the files inside the given directory.
     * If the path is null, does not exist or cannot be listed, the library stays empty.
     * A file that cannot be read is logged and skipped.
     *
     * @param rewritingPath directory containing the rule files, may be null
     */
    public URLRewriterLibrary(final File rewritingPath) {
        this.rewriters = new LinkedHashMap<Pattern, String>();
        this.rewritingPath = rewritingPath;
        if (this.rewritingPath == null || !this.rewritingPath.exists()) {
            return;
        }
        // File.list() returns null if the path is not a directory or an I/O error occurs
        final String[] files = this.rewritingPath.list();
        if (files == null) {
            return;
        }
        for (final String f : files) {
            final File ff = new File(this.rewritingPath, f);
            try {
                final BlockingQueue<String> list = Files.concurentLineReader(ff, 1000);
                String line;
                // the end of the input is signalled by the Files.POISON_LINE marker,
                // which is compared by reference
                while ((line = list.take()) != Files.POISON_LINE) {
                    addRule(line);
                }
            } catch (InterruptedException e) {
                // keep the interrupt visible to the caller and stop loading
                Thread.currentThread().interrupt();
                log.warn("interrupted while reading rewriting file " + f, e);
                return;
            } catch (Throwable e) {
                log.warn("cannot read rewriting file " + f, e);
            }
        }
    }

    /**
     * Create an empty library without any rewriting rules.
     */
    public URLRewriterLibrary() {
        this.rewriters = new LinkedHashMap<Pattern, String>();
        this.rewritingPath = null;
    }

    /**
     * Parse a single line of a rule file and register the rule it contains, if any.
     * A pattern that cannot be compiled is logged and skipped.
     *
     * @param rawLine one line as read from a rule file
     */
    private void addRule(final String rawLine) {
        final String line = rawLine.trim();
        if (line.length() == 0 || line.charAt(0) == '#') {
            return;
        }
        if (line.startsWith("s/")) {
            // lines of this form are not evaluated
            return;
        }
        int p = line.indexOf('=');
        if (p < 0) {
            p = line.indexOf(':');
        }
        if (p <= 0) {
            return;
        }
        final String regex = line.substring(0, p);
        try {
            this.rewriters.put(Pattern.compile(regex), line.substring(p + 1));
        } catch (PatternSyntaxException e) {
            log.warn("bad pattern: " + regex);
        }
    }

    /**
     * Apply all rewriting rules to the given string, one after the other, in the
     * order in which the rules were added. For each rule every occurrence of the
     * pattern inside the string is replaced by the replacement of that rule.
     *
     * @param s the string (url) to rewrite, may be null
     * @return the rewritten string; the given string itself if no rule applies
     */
    public String apply(String s) {
        if (s == null || this.rewriters.isEmpty()) {
            return s;
        }
        for (final Map.Entry<Pattern, String> entry : this.rewriters.entrySet()) {
            final Matcher m = entry.getKey().matcher(s);
            // find() and not matches(): a rule describes a fragment of the url
            // (like a session id), not the complete url
            if (m.find()) {
                s = m.replaceAll(entry.getValue());
            }
        }
        return s;
    }

    /**
     * Demo: applies some sample rules to the urls given as arguments (or to a
     * built-in sample url if no argument is given) and prints the results.
     *
     * @param args urls to rewrite
     */
    public static void main(String[] args) {
        URLRewriterLibrary lib = new URLRewriterLibrary();
        lib.rewriters.put(Pattern.compile("cln_\\d+\\/"), ""); // www.bund.de
        lib.rewriters.put(Pattern.compile("&amp;administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
        lib.rewriters.put(Pattern.compile("\\?administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
        // the original pattern ended with a dangling backslash and could not be compiled;
        // NOTE(review): completed to match "(X(1))/" - confirm against real herzogenrath urls
        lib.rewriters.put(Pattern.compile("\\(X\\([1]\\)\\)\\/"), ""); // herzogenrath
        lib.rewriters.put(Pattern.compile("\\(S\\([0-9a-z]+\\)\\)\\/"), ""); // herzogenrath
        lib.rewriters.put(Pattern.compile("&amp;ccm=[0-9]*"), ""); // herne
        lib.rewriters.put(Pattern.compile("&sid=[0-9]{14}.{8}"), ""); // startercenter

        final String[] urls = args.length > 0 ? args : new String[] {
            "http://www.example.org/(S(abc123))/index.aspx?x=1&sid=20121008120000abcdefgh"
        };
        for (final String url : urls) {
            System.out.println(url + " -> " + lib.apply(url));
        }
    }
}

@ -53,6 +53,7 @@ import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
import net.yacy.cora.storage.Files;
import net.yacy.crawler.retrieval.URLRewriterLibrary;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -64,12 +65,14 @@ public class LibraryProvider {
// names of dictionary directories; they are resolved relative to dictRoot
// (e.g. new File(dictRoot, path_to_rewriter_dictionaries))
public static final String path_to_did_you_mean_dictionaries = "didyoumean";
public static final String path_to_autotagging_dictionaries = "autotagging";
public static final String path_to_synonym_dictionaries = "synonyms";
// directory holding the rule files of the url rewriter, see initRewriter()
public static final String path_to_rewriter_dictionaries = "rewriter";
// NOTE(review): presumably the file name suffix of dictionaries that are switched off - confirm against usage
public static final String disabledExtension = ".disabled";
public static WordCache dymLib = new WordCache(null);
public static AutotaggingLibrary autotagging = null;
// assigned by initSynonyms()
public static SynonymLibrary synonyms = null;
// null until initRewriter() has been called
public static URLRewriterLibrary urlRewriter = null;
public static OverarchingLocation geoLoc = new OverarchingLocation();
private static File dictSource = null;
private static File dictRoot = null;
@ -124,6 +127,7 @@ public class LibraryProvider {
activateDeReWo();
initDidYouMean();
initSynonyms();
initRewriter();
integrateOpenGeoDB();
integrateGeonames0(-1);
integrateGeonames1(-1);
@ -195,6 +199,13 @@ public class LibraryProvider {
}
synonyms = new SynonymLibrary(synonymPath);
}
/**
 * Set up the url rewriter library.
 * The rewriter directory below the dictionary root is created if it is
 * missing; afterwards a new URLRewriterLibrary for that directory is
 * stored in the static field urlRewriter.
 */
public static void initRewriter() {
    final File rulesDirectory = new File(dictRoot, path_to_rewriter_dictionaries);
    final boolean directoryPresent = rulesDirectory.exists();
    if (!directoryPresent) {
        // make sure the directory is there before the library looks into it
        rulesDirectory.mkdirs();
    }
    urlRewriter = new URLRewriterLibrary(rulesDirectory);
}
public static void activateDeReWo() {
// translate input files (once..)
final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries);

@ -2410,13 +2410,21 @@ public final class Switchboard extends serverSwitch
// process the next hyperlink
nextUrl = nextEntry.getKey();
final String u = nextUrl.toNormalform(true, true, true);
String u = nextUrl.toNormalform(true, true, true);
if ( !(u.startsWith("http://")
|| u.startsWith("https://")
|| u.startsWith("ftp://")
|| u.startsWith("smb://") || u.startsWith("file://")) ) {
continue;
}
// rewrite the url
String u0 = LibraryProvider.urlRewriter.apply(u);
if (!u.equals(u0)) {
log.logInfo("REWRITE of url = \"" + u + "\" to \"" + u0 + "\"");
u = u0;
}
// enqueue the hyperlink into the pre-notice-url db
try {
this.crawlStacker.enqueueEntry(new Request(

Loading…
Cancel
Save