Added (partly commented-out) test code for URL rewrite methods; to be completed.
pull/1/head
orbiter 11 years ago
parent 74c86a72a0
commit 909bbb49d8

@ -47,6 +47,8 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.analysis.Classification;
@ -66,7 +68,7 @@ import net.yacy.kelondro.util.FileUtils;
public class Document {
private final DigestURL source; // the source url
private DigestURL source; // the source url
private final String mimeType; // mimeType as taken from http header
private final String charset; // the charset of the document
private final List<String> keywords; // most resources provide a keyword field
@ -321,6 +323,24 @@ dc_rights
return this.source;
}
/**
 * Rewrite the dc_source by applying a regular-expression replacement to the
 * string returned by <code>toNormalform(false)</code>; this can be used for
 * normalization purposes.
 * <p>
 * Every occurrence of the pattern inside the url string is replaced. The
 * pattern does not need to cover the whole url. If the pattern does not occur
 * at all, or if the rewritten string is rejected as a malformed url, the
 * source is left unchanged.
 *
 * @param pattern the pattern to search for inside the source url string
 * @param replacement the replacement, in the syntax accepted by {@link Matcher#replaceAll(String)}
 */
public void rewrite_dc_source(Pattern pattern, String replacement) {
    final String original = this.source.toNormalform(false);
    final Matcher m = pattern.matcher(original);
    // find() rather than matches(): matches() only succeeds when the pattern
    // covers the complete url, so a partial pattern would never trigger a rewrite
    if (!m.find()) {
        return;
    }
    // replaceAll() resets the matcher itself, so the find() above does not skip any occurrence
    final String rewritten = m.replaceAll(replacement);
    try {
        this.source = new DigestURL(rewritten);
    } catch (final MalformedURLException ignored) {
        // deliberate best-effort: a rewrite that does not produce a valid url
        // is discarded and the previous source stays in place
    }
}
/**
* @return the supposed charset of this document or <code>null</code> if unknown
*/

@ -2477,6 +2477,7 @@ public final class Switchboard extends serverSwitch {
private Document[] parseDocument(final Response response) throws InterruptedException {
Document[] documents = null;
//final Pattern rewritePattern = Pattern.compile(";jsessionid.*");
final EventOrigin processCase = response.processCase(this.peers.mySeed().hash);
if ( this.log.isFine() ) {
@ -2530,6 +2531,7 @@ public final class Switchboard extends serverSwitch {
if (response.profile() != null) {
ArrayList<Document> newDocs = new ArrayList<Document>();
for (Document doc: documents) {
//doc.rewrite_dc_source(rewritePattern, "");
String rejectReason = this.crawlStacker.checkAcceptanceChangeable(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
if (rejectReason == null) {
newDocs.add(doc);
@ -2560,7 +2562,6 @@ public final class Switchboard extends serverSwitch {
if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(entry.getKey(), entry.getValue());
}
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
if (response.profile().directDocByURL()) {
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
@ -2593,6 +2594,8 @@ public final class Switchboard extends serverSwitch {
log.info("REWRITE of url = \"" + u + "\" to \"" + u0 + "\"");
u = u0;
}
//Matcher m = rewritePattern.matcher(u);
//if (m.matches()) u = m.replaceAll("");
// enqueue the hyperlink into the pre-notice-url db
try {

Loading…
Cancel
Save