From dff4f95c788f84749f701a961f4b3092dd25c43e Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 7 Jan 2010 00:42:12 +0000 Subject: [PATCH] some patches to get the torrent parser working git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6551 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 2 +- .../de/anomic/crawler/retrieval/Response.java | 10 +++--- source/de/anomic/search/Switchboard.java | 2 +- source/net/yacy/ai/example/testorder.java | 36 +++++++++++++++++++ source/net/yacy/document/Document.java | 7 ++-- .../yacy/document/parser/torrentParser.java | 28 ++++++++++----- 6 files changed, 66 insertions(+), 19 deletions(-) create mode 100755 source/net/yacy/ai/example/testorder.java diff --git a/defaults/yacy.init b/defaults/yacy.init index b8373d8c2..7b8326b45 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -127,7 +127,7 @@ update.onlySignedFiles = 1 # a peer can be re-started periodically # restart.process can be either 'off' (no automatic restart) or 'time' (time- rule-based, see below) restart.process = off -# the restart.cycle is the number of hours that must pass bevore a restart is done +# the restart.cycle is the number of hours that must pass before a restart is done restart.cycle = 20 # the restart.hour is a pattern that must match with the hour string (two-digit, 24h) # when the restart should be performed diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index bde4ea013..c7ba1e739 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -654,7 +654,7 @@ public class Response { // check profile if (!profile().indexText() && !profile().indexMedia()) { - return "indexing not allowed - indexText and indexMedia not set (for crawler = " + profile.name()+ ")"; + return "indexing not allowed - indexText and indexMedia not set (for crawler = " + profile.name() + ")"; } // -CGI access in request @@ 
-670,17 +670,19 @@ public class Response { // -ranges in request // we checked that in shallStoreCache - // check if pictures can be indexed + // check if document can be indexed if (responseHeader != null) { final String mimeType = responseHeader.mime(); String parserError = TextParser.supportsMime(mimeType); - if (parserError != null) { return "Media_Content, parser error: " + parserError; } + if (parserError != null && TextParser.supportsExtension(url()) != null) return "no parser available: " + parserError; } + /* if (Classification.isMediaExtension(url().getFileExtension()) && !Classification.isImageExtension((url().getFileExtension()))) { return "Media_Content_(forbidden)"; } - + */ + // -if-modified-since in request // if the page is fresh at the very moment we can index it // -> this does not apply for the crawler diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index a35289dd2..88084e80a 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1200,7 +1200,7 @@ public final class Switchboard extends serverSwitch { if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason); addURLtoErrorDB(response.url(), (referrerURL == null) ? 
"" : referrerURL.hash(), response.initiator(), response.name(), noIndexReason); // finish this entry - return "not indexed any word in URL " + response.url() + "; cause: " + noIndexReason; + return "not allowed: " + noIndexReason; } // put document into the concurrent processing queue diff --git a/source/net/yacy/ai/example/testorder.java b/source/net/yacy/ai/example/testorder.java new file mode 100755 index 000000000..cd3cc0b81 --- /dev/null +++ b/source/net/yacy/ai/example/testorder.java @@ -0,0 +1,36 @@ +package net.yacy.ai.example; + +import java.util.Random; +import java.util.concurrent.PriorityBlockingQueue; + +public class testorder implements Comparable<testorder> { + + public int x; + public testorder(int x) { + this.x = x; + } + public String toString() { + return Integer.toString(this.x); + } + + public int compareTo(testorder o) { + if (this.x > o.x) return 1; + if (this.x < o.x) return -1; + return 0; + } + + public static void main(String[] args) { + PriorityBlockingQueue<testorder> q = new PriorityBlockingQueue<testorder>(); + Random r = new Random(); + for (int i = 0; i < 10; i++) { + q.add(new testorder(r.nextInt(20))); + } + while (!q.isEmpty()) + try { + System.out.println(q.take().toString()); + } catch (InterruptedException e) { + + e.printStackTrace(); + } + } +} diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 8444d6765..932c8eeab 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -391,13 +391,10 @@ dc_rights else if (Classification.isAudioExtension(ext)) audiolinks.put(url, entry.getValue()); else if (Classification.isVideoExtension(ext)) videolinks.put(url, entry.getValue()); else if (Classification.isApplicationExtension(ext)) applinks.put(url, entry.getValue()); - } else { - hyperlinks.put(url, entry.getValue()); } - } else { - // a path to a directory - hyperlinks.put(url, entry.getValue()); } + // in any case we consider this as a link and let the parser decide if that 
link can be followed + hyperlinks.put(url, entry.getValue()); } } diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index cce24c27e..d494972e2 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -87,16 +87,28 @@ public class torrentParser extends AbstractParser implements Idiom { if (bo == null) throw new ParserException("BDecoder.parse returned null", location); if (bo.getType() != BType.dictionary) throw new ParserException("BDecoder object is not a dictionary", location); Map<String, BObject> map = bo.getMap(); - String comment = map.get("comment").getString(); + BObject commento = map.get("comment"); + String comment = (commento == null) ? "" : commento.getString(); //Date creation = new Date(map.get("creation date").getInteger()); - Map<String, BObject> info = map.get("info").getMap(); - List<BObject> filelist = info.get("files").getList(); - StringBuilder filenames = new StringBuilder(40 * filelist.size()); - for (BObject fo: filelist) { - List<BObject> l = fo.getMap().get("path").getList(); // one file may have several names - for (BObject fl: l) filenames.append(fl.toString()).append(" "); + BObject infoo = map.get("info"); + StringBuilder filenames = new StringBuilder(); + String name = ""; + if (infoo != null) { + Map<String, BObject> info = infoo.getMap(); + BObject fileso = info.get("files"); + if (fileso != null) { + List<BObject> filelist = fileso.getList(); + for (BObject fo: filelist) { + BObject patho = fo.getMap().get("path"); + if (patho != null) { + List<BObject> l = patho.getList(); // one file may have several names + for (BObject fl: l) filenames.append(fl.toString()).append(" "); + } + } + } + BObject nameo = info.get("name"); + if (nameo != null) name = nameo.getString(); } - String name = info.get("name").getString(); try { return new Document( location,