diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index 6ef0b6262..acc55e9d1 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -47,6 +47,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.analysis.Classification;
@@ -66,7 +68,7 @@ import net.yacy.kelondro.util.FileUtils;
 
 public class Document {
 
-    private final DigestURL source; // the source url
+    private DigestURL source; // the source url
     private final String mimeType; // mimeType as taken from http header
     private final String charset; // the charset of the document
     private final List keywords; // most resources provide a keyword field
@@ -321,6 +323,26 @@ dc_rights
         return this.source;
     }
 
+    /**
+     * Rewrite the source url (dc_source) of this document; this can be used for normalization purposes.
+     * The pattern is searched within the normal form of the current source url and every
+     * occurrence is replaced. The source is only exchanged if the pattern was found and the
+     * rewritten string is a well-formed url; otherwise the document keeps its current source.
+     * @param pattern the pattern to search for in the normal form of the source url
+     * @param replacement the replacement for each match, see {@link Matcher#replaceAll(String)}
+     */
+    public void rewrite_dc_source(final Pattern pattern, final String replacement) {
+        final String u = this.source.toNormalform(false);
+        final Matcher m = pattern.matcher(u);
+        // find() and not matches(): a pattern such as ";jsessionid.*" covers only a part of the url
+        if (!m.find()) return;
+        try {
+            this.source = new DigestURL(m.replaceAll(replacement));
+        } catch (final MalformedURLException ignored) {
+            // the rewritten string is not a valid url: keep the current source untouched
+        }
+    }
+
     /**
      * @return the supposed charset of this document or null if unknown
      */
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 1cfe10298..c22083579 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2477,6 +2477,7 @@ public final class Switchboard extends serverSwitch {
 
     private Document[] parseDocument(final Response response) throws InterruptedException {
         Document[] documents = null;
+        //final Pattern rewritePattern = Pattern.compile(";jsessionid.*");
         final EventOrigin processCase = response.processCase(this.peers.mySeed().hash);
 
         if ( this.log.isFine() ) {
@@ -2530,6 +2531,7 @@ public final class Switchboard extends serverSwitch {
         if (response.profile() != null) {
             ArrayList newDocs = new ArrayList();
             for (Document doc: documents) {
+                //doc.rewrite_dc_source(rewritePattern, "");
                 String rejectReason = this.crawlStacker.checkAcceptanceChangeable(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
                 if (rejectReason == null) {
                     newDocs.add(doc);
@@ -2560,7 +2562,6 @@ public final class Switchboard extends serverSwitch {
             if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(entry.getKey(), entry.getValue());
         }
 
-
         // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
         if (response.profile().directDocByURL()) {
             for (Map.Entry entry: Document.getImagelinks(documents).entrySet()) {
@@ -2593,6 +2594,8 @@ public final class Switchboard extends serverSwitch {
                     log.info("REWRITE of url = \"" + u + "\" to \"" + u0 + "\"");
                     u = u0;
                 }
+                //Matcher m = rewritePattern.matcher(u);
+                //if (m.matches()) u = m.replaceAll("");
 
                 // enqueue the hyperlink into the pre-notice-url db
                 try {