You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/net/yacy/crawler/retrieval/URLRewriterLibrary.java

104 lines
4.1 KiB

/**
* URLRewriterLibrary
* SPDX-FileCopyrightText: 2012 Michael Peter Christen <mc@yacy.net)>
* SPDX-License-Identifier: GPL-2.0-or-later
* Frankfurt a. M., Germany
* first published 08.10.2012 on http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler.retrieval;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.storage.Files;
import net.yacy.cora.util.ConcurrentLog;
public class URLRewriterLibrary {
private final static ConcurrentLog log = new ConcurrentLog(URLRewriterLibrary.class.getName());
private final File rewritingPath;
private final Map<Pattern, String> rewriters;
public URLRewriterLibrary(final File rewritingPath) {
this.rewriters = new HashMap<Pattern, String>();
this.rewritingPath = rewritingPath;
if (this.rewritingPath == null || !this.rewritingPath.exists()) {
return;
}
final String[] files = this.rewritingPath.list();
for (final String f: files) {
File ff = new File(this.rewritingPath, f);
try {
BlockingQueue<String> list = Files.concurentLineReader(ff);
String line;
while ((line = list.take()) != Files.POISON_LINE) {
line = line.trim();
if (line.length() == 0 || line.charAt(0) == '#') continue;
if (!line.startsWith("s/")) {
int p = line.indexOf('=');
if (p < 0) p = line.indexOf(':');
if (p > 0) try {
this.rewriters.put(Pattern.compile(line.substring(0, p)), line.substring(p + 1));
} catch (final PatternSyntaxException e) {
log.warn("bad pattern: " + line.substring(0, p));
}
}
}
} catch (final Throwable e) {
log.warn("cannot read stemming file " + f, e);
}
}
}
public URLRewriterLibrary() {
this.rewriters = new HashMap<Pattern, String>();
this.rewritingPath = null;
}
public String apply(String s) {
if (this.rewriters == null || this.rewriters.size() == 0) return s;
for (Map.Entry<Pattern, String> entry: this.rewriters.entrySet()) {
Matcher m = entry.getKey().matcher(s);
if (m.matches()) s = m.replaceAll(entry.getValue());
}
return s;
}
public static void main(String[] args) {
URLRewriterLibrary lib = new URLRewriterLibrary();
lib.rewriters.put(Pattern.compile("cln_\\d+\\/"), ""); // www.bund.de
lib.rewriters.put(Pattern.compile("&amp;administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
lib.rewriters.put(Pattern.compile("\\?administration=[0-9a-z]*"), ""); // http://www.lichtenau.de/
lib.rewriters.put(Pattern.compile("\\(X\\([1]\\"), ""); // herzogenrath
lib.rewriters.put(Pattern.compile("\\(S\\([0-9a-z]+\\)\\)\\/"), ""); // herzogenrath
lib.rewriters.put(Pattern.compile("&amp;ccm=[0-9]*"), ""); // herne
lib.rewriters.put(Pattern.compile("&sid=[0-9]{14}.{8}"), ""); // startercenter
String s = "";
Pattern p = Pattern.compile("a");
s = p.matcher(s).replaceAll("b");
}
}