You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
102 lines
4.1 KiB
102 lines
4.1 KiB
/**
|
|
* URLRewriterLibrary
|
|
* Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
* first published 08.10.2012 on http://yacy.net
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
* along with this program in the file lgpl21.txt
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
package net.yacy.crawler.retrieval;
|
|
|
|
import java.io.File;
|
|
import java.util.HashMap;
|
|
import java.util.Map;
|
|
import java.util.concurrent.BlockingQueue;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
import java.util.regex.PatternSyntaxException;
|
|
|
|
import net.yacy.cora.storage.Files;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
|
|
|
|
public class URLRewriterLibrary {
|
|
|
|
private final static ConcurrentLog log = new ConcurrentLog(URLRewriterLibrary.class.getName());
|
|
|
|
private final File rewritingPath;
|
|
private final Map<Pattern, String> rewriters;
|
|
|
|
public URLRewriterLibrary(final File rewritingPath) {
|
|
this.rewriters = new HashMap<Pattern, String>();
|
|
this.rewritingPath = rewritingPath;
|
|
if (this.rewritingPath == null || !this.rewritingPath.exists()) {
|
|
return;
|
|
}
|
|
final String[] files = this.rewritingPath.list();
|
|
for (final String f: files) {
|
|
File ff = new File(this.rewritingPath, f);
|
|
try {
|
|
BlockingQueue<String> list = Files.concurentLineReader(ff, 1000);
|
|
String line;
|
|
while ((line = list.take()) != Files.POISON_LINE) {
|
|
line = line.trim();
|
|
if (line.length() == 0 || line.charAt(0) == '#') continue;
|
|
if (!line.startsWith("s/")) {
|
|
int p = line.indexOf('=');
|
|
if (p < 0) p = line.indexOf(':');
|
|
if (p > 0) try {
|
|
this.rewriters.put(Pattern.compile(line.substring(0, p)), line.substring(p + 1));
|
|
} catch (final PatternSyntaxException e) {
|
|
log.warn("bad pattern: " + line.substring(0, p));
|
|
}
|
|
}
|
|
}
|
|
} catch (final Throwable e) {
|
|
log.warn("cannot read stemming file " + f, e);
|
|
}
|
|
}
|
|
}
|
|
|
|
public URLRewriterLibrary() {
|
|
this.rewriters = new HashMap<Pattern, String>();
|
|
this.rewritingPath = null;
|
|
}
|
|
|
|
public String apply(String s) {
|
|
if (this.rewriters == null || this.rewriters.size() == 0) return s;
|
|
for (Map.Entry<Pattern, String> entry: this.rewriters.entrySet()) {
|
|
Matcher m = entry.getKey().matcher(s);
|
|
if (m.matches()) s = m.replaceAll(entry.getValue());
|
|
}
|
|
return s;
|
|
}
|
|
|
|
public static void main(String[] args) {
|
|
URLRewriterLibrary lib = new URLRewriterLibrary();
|
|
lib.rewriters.put(Pattern.compile("cln_\\d+\\/"), ""); // www.bund.de
|
|
lib.rewriters.put(Pattern.compile("&administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
|
|
lib.rewriters.put(Pattern.compile("\\?administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
|
|
lib.rewriters.put(Pattern.compile("\\(X\\([1]\\"), ""); // herzogenrath
|
|
lib.rewriters.put(Pattern.compile("\\(S\\([0-9a-z]+\\)\\)\\/"), ""); // herzogenrath
|
|
lib.rewriters.put(Pattern.compile("&ccm=[0-9]*"), ""); // herne
|
|
lib.rewriters.put(Pattern.compile("&sid=[0-9]{14}.{8}"), ""); // startercenter
|
|
String s = "";
|
|
Pattern p = Pattern.compile("a");
|
|
s = p.matcher(s).replaceAll("b");
|
|
}
|
|
|
|
}
|