parent
9190599d21
commit
4b5e0c1500
@ -0,0 +1,102 @@
|
||||
/**
|
||||
* URLRewriterLibrary
|
||||
* Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
* first published 08.10.2012 on http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.crawler.retrieval;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
import net.yacy.cora.storage.Files;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
|
||||
public class URLRewriterLibrary {
|
||||
|
||||
private final static Logger log = Logger.getLogger(URLRewriterLibrary.class);
|
||||
|
||||
private final File rewritingPath;
|
||||
private final Map<Pattern, String> rewriters;
|
||||
|
||||
public URLRewriterLibrary(final File rewritingPath) {
|
||||
this.rewriters = new HashMap<Pattern, String>();
|
||||
this.rewritingPath = rewritingPath;
|
||||
if (this.rewritingPath == null || !this.rewritingPath.exists()) {
|
||||
return;
|
||||
}
|
||||
final String[] files = this.rewritingPath.list();
|
||||
for (final String f: files) {
|
||||
File ff = new File(this.rewritingPath, f);
|
||||
try {
|
||||
BlockingQueue<String> list = Files.concurentLineReader(ff, 1000);
|
||||
String line;
|
||||
while ((line = list.take()) != Files.POISON_LINE) {
|
||||
line = line.trim();
|
||||
if (line.length() == 0 || line.charAt(0) == '#') continue;
|
||||
if (!line.startsWith("s/")) {
|
||||
int p = line.indexOf('=');
|
||||
if (p < 0) p = line.indexOf(':');
|
||||
if (p > 0) try {
|
||||
this.rewriters.put(Pattern.compile(line.substring(0, p)), line.substring(p + 1));
|
||||
} catch (PatternSyntaxException e) {
|
||||
log.warn("bad pattern: " + line.substring(0, p));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
log.warn("cannot read stemming file " + f, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public URLRewriterLibrary() {
|
||||
this.rewriters = new HashMap<Pattern, String>();
|
||||
this.rewritingPath = null;
|
||||
}
|
||||
|
||||
public String apply(String s) {
|
||||
if (this.rewriters == null || this.rewriters.size() == 0) return s;
|
||||
for (Map.Entry<Pattern, String> entry: this.rewriters.entrySet()) {
|
||||
Matcher m = entry.getKey().matcher(s);
|
||||
if (m.matches()) s = m.replaceAll(entry.getValue());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
URLRewriterLibrary lib = new URLRewriterLibrary();
|
||||
lib.rewriters.put(Pattern.compile("cln_\\d+\\/"), ""); // www.bund.de
|
||||
lib.rewriters.put(Pattern.compile("&administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
|
||||
lib.rewriters.put(Pattern.compile("\\?administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
|
||||
lib.rewriters.put(Pattern.compile("\\(X\\([1]\\"), ""); // herzogenrath
|
||||
lib.rewriters.put(Pattern.compile("\\(S\\([0-9a-z]+\\)\\)\\/"), ""); // herzogenrath
|
||||
lib.rewriters.put(Pattern.compile("&ccm=[0-9]*"), ""); // herne
|
||||
lib.rewriters.put(Pattern.compile("&sid=[0-9]{14}.{8}"), ""); // startercenter
|
||||
String s = "";
|
||||
Pattern p = Pattern.compile("a");
|
||||
s = p.matcher(s).replaceAll("b");
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in new issue