diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index f59b54b34..300b2640e 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -30,6 +30,7 @@
 // if the shell's current path is HTROOT
 
 import java.io.File;
+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Date;
 import java.util.HashMap;
@@ -37,11 +38,11 @@ import java.util.Iterator;
 import java.util.Set;
 
 import net.yacy.document.Document;
+import net.yacy.document.ParserException;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.DateFormatter;
-import net.yacy.repository.LoaderDispatcher;
 
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.BookmarkHelper;
@@ -187,9 +188,9 @@ public class Bookmarks {
             // try to get the bookmark from the LURL database
             final URIMetadataRow urlentry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(urlHash.getBytes(), null, 0);
             Document document = null;
-            if (urlentry != null) {
+            if (urlentry != null) try {
                 final URIMetadataRow.Components metadata = urlentry.metadata();
-                document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE);
+                document = sb.loader.loadDocument(sb.loader.request(metadata.url(), true, false), CrawlProfile.CacheStrategy.IFEXIST, 5000, Long.MAX_VALUE);
                 prop.put("mode_edit", "0"); // create mode
                 prop.put("mode_url", metadata.url().toNormalform(false, true));
                 prop.putHTML("mode_title", metadata.dc_title());
@@ -199,7 +200,7 @@ public class Bookmarks {
                 prop.putHTML("mode_path","");
                 prop.put("mode_public", "0");
                 prop.put("mode_feed", "0"); //TODO: check if it IS a feed
-            }
+            } catch (IOException e) {Log.logException(e);} catch (ParserException e) {Log.logException(e);}
             if (document != null) document.close();
         } else {
             // get from the bookmark database
diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java
index 787570a9d..4c3b1bc69 100644
--- a/htroot/DictionaryLoader_p.java
+++ b/htroot/DictionaryLoader_p.java
@@ -63,7 +63,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geon0Load")) {
             // load from the net
             try {
-                Response response = sb.loader.load(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+                Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
                 byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
                 LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocalization(LibraryProvider.Dictionary.GEON0.file()));
@@ -103,7 +103,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geo1Load")) {
             // load from the net
             try {
-                Response response = sb.loader.load(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+                Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
                 byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
                 LibraryProvider.geoLoc.removeLocalization(LibraryProvider.Dictionary.GEODB0.nickname);
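
Every caller in this patch follows the same mechanical rewrite: the multi-flag load(url, forText, global, cacheStrategy, maxFileSize) overload is gone, and a Request built by loader.request(...) is handed to the single load(Request, ...) entry point. A minimal sketch of the resulting call sequence, assuming a Switchboard sb and an example URL (both assumptions, not part of the diff):

    // sketch only: the two-step loader API used throughout this patch
    Response fetch(final Switchboard sb) throws IOException {
        final DigestURI url = new DigestURI("http://example.org/feed.rss", null); // example URL, an assumption
        // step 1: build a Request that says *what* to load (forText=true, global=false)
        final Request request = sb.loader.request(url, true, false);
        // step 2: execute it with an explicit cache policy and size limit
        return sb.loader.load(request, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
    }

Splitting the call this way moves the what/how distinction into types instead of a row of boolean flags.
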
diff --git a/htroot/RSSLoader_p.java b/htroot/RSSLoader_p.java
index 295715492..3098fa3b8 100644
--- a/htroot/RSSLoader_p.java
+++ b/htroot/RSSLoader_p.java
@@ -64,7 +64,7 @@ public class RSSLoader_p {
         // if the resource body was not cached we try to load it from web
         Response entry = null;
         try {
-            entry = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+            entry = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
         } catch (final Exception e) {
             return prop;
         }
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index a2fba71d1..8719583aa 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -25,7 +25,6 @@
 //javac -classpath .:../Classes Status.java
 //if the shell's current path is HTROOT
 
-import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
@@ -43,13 +42,11 @@ import net.yacy.document.parser.html.CharacterCoding;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.repository.LoaderDispatcher;
 
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.http.client.Cache;
 import de.anomic.http.server.RequestHeader;
-import de.anomic.http.server.ResponseHeader;
 import de.anomic.search.Segment;
 import de.anomic.search.Segments;
 import de.anomic.search.Switchboard;
@@ -168,35 +165,22 @@ public class ViewFile {
         // loading the resource content as byte array
         prop.put("error_incache", Cache.has(url) ? 1 : 0);
 
-        String resMime = null;
-        ResponseHeader responseHeader = responseHeader = Cache.getResponseHeader(url);;
-        byte[] resource = Cache.getContent(url);
-
-        if ((resource == null || responseHeader == null) && authorized) {
-            // load resource from net
-            Response response = null;
-            try {
-                response = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
-            } catch (IOException e) {
-                prop.put("error", "4");
-                prop.putHTML("error_errorText", e.getMessage());
-                prop.put("viewMode", VIEW_MODE_NO_TEXT);
-                return prop;
-            }
-            if (response != null) {
-                resource = response.getContent();
-                responseHeader = response.getResponseHeader();
-            }
+        Response response = null;
+        try {
+            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CrawlProfile.CacheStrategy.IFEXIST : CrawlProfile.CacheStrategy.CACHEONLY, Long.MAX_VALUE);
+        } catch (IOException e) {
+            prop.put("error", "4");
+            prop.put("error_errorText", "error loading resource: " + e.getMessage());
+            prop.put("viewMode", VIEW_MODE_NO_TEXT);
+            return prop;
         }
 
-        // if resource not available just fail
-        if (resource == null || responseHeader == null) {
+        if (response == null) {
             prop.put("error", "4");
             prop.put("error_errorText", "No resource available");
             prop.put("viewMode", VIEW_MODE_NO_TEXT);
             return prop;
         }
 
-        resMime = responseHeader.mime();
-
         final String[] wordArray = wordArray(post.get("words", null));
@@ -205,14 +189,12 @@ public class ViewFile {
             // TODO: how to handle very large files here ?
             String content;
             try {
-                content = new String(resource, "UTF-8");
+                content = new String(response.getContent(), "UTF-8");
             } catch (final Exception e) {
                 prop.put("error", "4");
                 prop.putHTML("error_errorText", e.getMessage());
                 prop.put("viewMode", VIEW_MODE_NO_TEXT);
                 return prop;
-            } finally {
-                resource = null;
             }
 
             prop.put("error", "0");
@@ -231,7 +213,7 @@ public class ViewFile {
             // parsing the resource content
             Document document = null;
             try {
-                document = LoaderDispatcher.parseDocument(url, resource.length, new ByteArrayInputStream(resource), responseHeader);
+                document = response.parse();
                 if (document == null) {
                     prop.put("error", "5");
                     prop.put("error_errorText", "Unknown error");
@@ -243,11 +225,7 @@ public class ViewFile {
                 prop.putHTML("error_errorText", e.getMessage());
                 prop.put("viewMode", VIEW_MODE_NO_TEXT);
                 return prop;
-            } finally {
-                resource = null;
             }
-
-            resMime = document.dc_format();
 
             if (viewMode.equals("parsed")) {
                 final String content = new String(document.getTextBytes());
@@ -352,8 +330,8 @@ public class ViewFile {
         prop.put("error_wordCount", wordCount);
         prop.putHTML("error_desc", descr);
         prop.putNum("error_size", size);
-        prop.put("error_mimeTypeAvailable", (resMime == null) ? "0" : "1");
-        prop.put("error_mimeTypeAvailable_mimeType", resMime);
+        prop.put("error_mimeTypeAvailable", (response.getMimeType() == null) ? "0" : "1");
+        prop.put("error_mimeTypeAvailable_mimeType", response.getMimeType());
 
         return prop;
     }
diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index 92e62c2b3..3eeef1e4e 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -84,14 +84,13 @@ public class ViewImage {
         int height = post.getInt("height", 0);
         int maxwidth = post.getInt("maxwidth", 0);
         int maxheight = post.getInt("maxheight", 0);
-        final int timeout = post.getInt("timeout", 5000);
 
         // get the image as stream
         Image scaled = iconcache.get(urlString);
         if (scaled == null) {
             byte[] resourceb = null;
             if (url != null) try {
-                resourceb = sb.loader.getResource(url, CrawlProfile.CacheStrategy.IFEXIST, timeout, false, true);
+                resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CrawlProfile.CacheStrategy.IFEXIST);
             } catch (IOException e) {
                 Log.logWarning("ViewImage", "cannot load: " + e.getMessage());
             }
diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java
index 7bd69c4a0..e722e7d7d 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@@ -6,7 +6,6 @@ import java.util.Set;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.repository.LoaderDispatcher;
 
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.http.server.RequestHeader;
@@ -55,14 +54,9 @@ public class getpageinfo_p {
         }
         ContentScraper scraper = null;
         if (u != null) try {
-            scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CacheStrategy.IFFRESH);
+            scraper = sb.loader.parseResource(u, CrawlProfile.CacheStrategy.IFFRESH);
         } catch (final IOException e) {
-            // try again, try harder
-            try {
-                scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CacheStrategy.IFEXIST);
-            } catch (final IOException ee) {
-                // now thats a fail, do nothing
-            }
+            // now that's a fail, do nothing
         }
         if (scraper != null) {
             // put the document title
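
Note the behaviour change in getpageinfo_p: the old code retried a failed IFFRESH parse with the weaker IFEXIST policy, while the new code gives up after the first IOException. A caller that still wants the softer fallback now has to spell it out; a sketch with a hypothetical helper (parseWithFallback is invented for illustration, not part of the patch):

    // hypothetical helper emulating the removed retry, built on the new instance method
    ContentScraper parseWithFallback(final LoaderDispatcher loader, final DigestURI u) {
        try {
            return loader.parseResource(u, CrawlProfile.CacheStrategy.IFFRESH);
        } catch (final IOException e) {
            try {
                return loader.parseResource(u, CrawlProfile.CacheStrategy.IFEXIST); // weaker: any cache hit is fine
            } catch (final IOException ee) {
                return null; // both attempts failed
            }
        }
    }
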
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index a6b87c30c..7e3b3055e 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -36,6 +36,7 @@ import net.yacy.cora.document.RSSFeed;
 import net.yacy.cora.document.RSSMessage;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
+import net.yacy.document.ParserException;
 import net.yacy.document.geolocalization.Location;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
@@ -49,7 +50,6 @@ import net.yacy.kelondro.util.Formatter;
 import net.yacy.kelondro.util.MemoryControl;
 import net.yacy.kelondro.util.SetTools;
 import net.yacy.kelondro.util.ISO639;
-import net.yacy.repository.LoaderDispatcher;
 
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.DidYouMean;
@@ -428,8 +428,12 @@ public class yacysearch {
             final URIMetadataRow urlentry = indexSegment.urlMetadata().load(recommendHash.getBytes(), null, 0);
             if (urlentry != null) {
                 final URIMetadataRow.Components metadata = urlentry.metadata();
-                Document document;
-                document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE);
+                Document document = null;
+                try {
+                    document = sb.loader.loadDocument(sb.loader.request(metadata.url(), true, false), CrawlProfile.CacheStrategy.IFEXIST, 5000, Long.MAX_VALUE);
+                } catch (IOException e) {
+                } catch (ParserException e) {
+                }
                 if (document != null) {
                     // create a news message
                     final HashMap<String, String> map = new HashMap<String, String>();
diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java
index 7ea57e023..1c1df1a20 100755
--- a/source/de/anomic/crawler/retrieval/Response.java
+++ b/source/de/anomic/crawler/retrieval/Response.java
@@ -26,9 +26,12 @@
 
 package de.anomic.crawler.retrieval;
 
+import java.io.ByteArrayInputStream;
 import java.util.Date;
 
 import net.yacy.document.Classification;
+import net.yacy.document.Document;
+import net.yacy.document.ParserException;
 import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.util.DateFormatter;
@@ -799,4 +802,16 @@ public class Response {
         return processCase;
     }
 
+    public Document parse() throws ParserException {
+        String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
+        if (supportError != null) throw new ParserException("no parser support: " + supportError, url());
+
+        try {
+            return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content.length, new ByteArrayInputStream(this.content));
+        } catch (InterruptedException e) {
+            return null;
+        }
+    }
 }
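
Response.parse() bundles the content, MIME type and charset of a fetched resource into a single parser call, which is what lets the callers above drop their ByteArrayInputStream plumbing. A minimal sketch of the intended call site (sb and url assumed as in the servlets above; this is essentially what the new LoaderDispatcher.loadDocument does internally):

    // sketch: fetch and parse with the new Response.parse()
    Document fetchAndParse(final Switchboard sb, final DigestURI url) throws IOException, ParserException {
        final Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
        if (response == null) throw new IOException("no response for " + url);
        // parse() throws ParserException for unsupported MIME types or broken content,
        // and returns null only if the parser thread was interrupted
        return response.parse();
    }
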
"UTF-8" : this.responseHeader.getCharacterEncoding(), this.content.length, new ByteArrayInputStream(this.content)); + } catch (InterruptedException e) { + return null; + } + + } } diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java index 0cbd5ddcd..722a1840c 100644 --- a/source/de/anomic/search/MediaSnippet.java +++ b/source/de/anomic/search/MediaSnippet.java @@ -24,6 +24,7 @@ package de.anomic.search; +import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; import java.util.Iterator; @@ -35,13 +36,13 @@ import de.anomic.data.MimeTable; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.Document; +import net.yacy.document.ParserException; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.ByteArray; -import net.yacy.repository.LoaderDispatcher; public class MediaSnippet implements Comparable, Comparator { @@ -118,7 +119,16 @@ public class MediaSnippet implements Comparable, Comparator(); } - final Document document = LoaderDispatcher.retrieveDocument(url, cacheStrategy, timeout, false, reindexing, Long.MAX_VALUE); + Document document; + try { + document = Switchboard.getSwitchboard().loader.loadDocument(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, timeout, Long.MAX_VALUE); + } catch (IOException e) { + Log.logFine("snippet fetch", "load error: " + e.getMessage()); + return new ArrayList(); + } catch (ParserException e) { + Log.logFine("snippet fetch", "parser error: " + e.getMessage()); + return new ArrayList(); + } final ArrayList a = new ArrayList(); if (document != null) { if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, ContentDomain.AUDIO)); diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java index 4ab21a73e..e0f7cf323 100644 --- a/source/de/anomic/search/QueryParams.java +++ b/source/de/anomic/search/QueryParams.java @@ -274,7 +274,19 @@ public final class QueryParams { return new String(sb); } - protected static final boolean matches(final String text, final HandleSet keyhashes) { + /** + * check if the given text matches with the query + * this checks inclusion and exclusion words + * @param text + * @return true if the query matches with the given text + */ + public final boolean matches(final String text) { + final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet()); + if (SetTools.anymatch(wordhashes, this.excludeHashes)) return false; + return SetTools.totalInclusion(this.queryHashes, wordhashes); + } + + protected static final boolean anymatch(final String text, final HandleSet keyhashes) { // returns true if any of the word hashes in keyhashes appear in the String text // to do this, all words in the string must be recognized and transcoded to word hashes final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet()); diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index 61a06395c..d563aaa8f 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -428,9 +428,9 @@ public final class RankingProcess extends Thread { final String pagetitle = 
diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java
index 61a06395c..d563aaa8f 100644
--- a/source/de/anomic/search/RankingProcess.java
+++ b/source/de/anomic/search/RankingProcess.java
@@ -428,9 +428,9 @@ public final class RankingProcess extends Thread {
             final String pagetitle = metadata.dc_title().toLowerCase();
 
             // check exclusion
-            if ((QueryParams.matches(pagetitle, query.excludeHashes)) ||
-                (QueryParams.matches(pageurl.toLowerCase(), query.excludeHashes)) ||
-                (QueryParams.matches(pageauthor.toLowerCase(), query.excludeHashes))) {
+            if ((QueryParams.anymatch(pagetitle, query.excludeHashes)) ||
+                (QueryParams.anymatch(pageurl.toLowerCase(), query.excludeHashes)) ||
+                (QueryParams.anymatch(pageauthor.toLowerCase(), query.excludeHashes))) {
                 continue;
             }
diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java
index ac3e78145..e7e89988f 100644
--- a/source/de/anomic/search/Segment.java
+++ b/source/de/anomic/search/Segment.java
@@ -26,10 +26,8 @@
 
 package de.anomic.search;
 
-import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.util.Date;
 import java.util.Iterator;
@@ -386,49 +384,34 @@ public class Segment {
         final URIMetadataRow.Components metadata = entry.metadata();
         if (metadata == null || metadata.url() == null) return 0;
 
-        InputStream resourceContent = null;
         try {
-            // get the resource content
-            byte[] resourceb = null;
-            try {
-                resourceb = loader.getResource(metadata.url(), cacheStrategy, 10000, true, false);
-            } catch (IOException e) {
-                Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
-            }
-            if (resourceb == null) {
+            // parse the resource
+            final Document document = loader.loadDocument(loader.request(metadata.url(), true, false), cacheStrategy, 10000, Long.MAX_VALUE);
+            if (document == null) {
                 // delete just the url entry
                 urlMetadata().remove(urlhash);
                 return 0;
-            } else {
-                resourceContent = new ByteArrayInputStream(resourceb);
-                final long resourceContentLength = resourceb.length;
-
-                // parse the resource
-                final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength, resourceContent, null);
-
-                // get the word set
-                Set<String> words = null;
-                try {
-                    words = new Condenser(document, true, true).words().keySet();
-                } catch (final UnsupportedEncodingException e) {
-                    Log.logException(e);
-                }
-
-                // delete all word references
-                int count = 0;
-                if (words != null) count = termIndex().remove(Word.words2hashesHandles(words), urlhash);
-
-                // finally delete the url entry itself
-                urlMetadata().remove(urlhash);
-                return count;
             }
+            // get the word set
+            Set<String> words = null;
+            try {
+                words = new Condenser(document, true, true).words().keySet();
+            } catch (final UnsupportedEncodingException e) {
+                Log.logException(e);
+            }
+
+            // delete all word references
+            int count = 0;
+            if (words != null) count = termIndex().remove(Word.words2hashesHandles(words), urlhash);
+
+            // finally delete the url entry itself
+            urlMetadata().remove(urlhash);
+            return count;
         } catch (final ParserException e) {
             return 0;
         } catch (IOException e) {
             Log.logException(e);
             return 0;
-        } finally {
-            if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */}
         }
     }
diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java
index 091ba4b52..718d14f2b 100644
--- a/source/de/anomic/search/TextSnippet.java
+++ b/source/de/anomic/search/TextSnippet.java
@@ -24,7 +24,6 @@
 
 package de.anomic.search;
 
-import java.io.ByteArrayInputStream;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.Iterator;
@@ -51,7 +50,6 @@ import net.yacy.repository.LoaderDispatcher;
 
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.http.client.Cache;
-import de.anomic.http.server.ResponseHeader;
 import de.anomic.yacy.yacySearch;
 
 public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
@@ -331,8 +329,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
[the remaining TextSnippet hunks were garbled in extraction and are not recoverable; a fragment of a later hunk survives:]
 final Map<MultiProtocolURI, String> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
 final TreeSet mainReleases = new TreeSet();
 final TreeSet devReleases = new TreeSet();
diff --git a/source/net/yacy/document/importer/OAIListFriendsLoader.java b/source/net/yacy/document/importer/OAIListFriendsLoader.java
index 99c559c48..c2390ef44 100644
--- a/source/net/yacy/document/importer/OAIListFriendsLoader.java
+++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java
@@ -81,7 +81,7 @@ public class OAIListFriendsLoader {
         Map m;
         for (Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
             if (!oaiFriend.getValue().exists()) {
-                Response response = loader == null ? null : loader.load(new DigestURI(oaiFriend.getKey(), null), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+                Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey(), null), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
                 if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
             }
diff --git a/source/net/yacy/document/importer/OAIPMHLoader.java b/source/net/yacy/document/importer/OAIPMHLoader.java
index 4a8c5b700..15ba542b9 100644
--- a/source/net/yacy/document/importer/OAIPMHLoader.java
+++ b/source/net/yacy/document/importer/OAIPMHLoader.java
@@ -48,7 +48,7 @@ public class OAIPMHLoader {
         this.source = source;
 
         // load the file from the net
-        Response response = loader.load(source, false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+        Response response = loader.load(loader.request(source, false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
         byte[] b = response.getContent();
         this.resumptionToken = new ResumptionToken(source, b);
         //System.out.println("*** ResumptionToken = " + this.resumptionToken.toString());
diff --git a/source/net/yacy/kelondro/util/SetTools.java b/source/net/yacy/kelondro/util/SetTools.java
index 867cf6cb2..30ef3987b 100644
--- a/source/net/yacy/kelondro/util/SetTools.java
+++ b/source/net/yacy/kelondro/util/SetTools.java
@@ -67,7 +67,7 @@ public class SetTools {
 
     // - join by iterative tests (where we distinguish left-right and right-left tests)
 
-    public static <A, B> TreeMap<A, B> joinConstructive(final Collection<TreeMap<A, B>> maps, final boolean concatStrings) {
+    public final static <A, B> TreeMap<A, B> joinConstructive(final Collection<TreeMap<A, B>> maps, final boolean concatStrings) {
         // this joins all TreeMap(s) contained in maps
 
         // first order entities by their size
@@ -109,7 +109,7 @@ public class SetTools {
         return joinResult;
     }
 
-    public static <A, B> TreeMap<A, B> joinConstructive(final TreeMap<A, B> map1, final TreeMap<A, B> map2, final boolean concatStrings) {
+    public final static <A, B> TreeMap<A, B> joinConstructive(final TreeMap<A, B> map1, final TreeMap<A, B> map2, final boolean concatStrings) {
         // comparators must be equal
         if ((map1 == null) || (map2 == null)) return null;
         if (map1.comparator() != map2.comparator()) return null;
@@ -130,7 +130,7 @@ public class SetTools {
     }
 
     @SuppressWarnings("unchecked")
-    private static <A, B> TreeMap<A, B> joinConstructiveByTest(final TreeMap<A, B> small, final TreeMap<A, B> large, final boolean concatStrings) {
+    private final static <A, B> TreeMap<A, B> joinConstructiveByTest(final TreeMap<A, B> small, final TreeMap<A, B> large, final boolean concatStrings) {
         final Iterator<Map.Entry<A, B>> mi = small.entrySet().iterator();
         final TreeMap<A, B> result = new TreeMap<A, B>(large.comparator());
         Map.Entry<A, B> mentry1;
@@ -150,7 +150,7 @@ public class SetTools {
     }
 
     @SuppressWarnings("unchecked")
-    private static <A, B> TreeMap<A, B> joinConstructiveByEnumeration(final TreeMap<A, B> map1, final TreeMap<A, B> map2, final boolean concatStrings) {
+    private final static <A, B> TreeMap<A, B> joinConstructiveByEnumeration(final TreeMap<A, B> map1, final TreeMap<A, B> map2, final boolean concatStrings) {
         // implement pairwise enumeration
         final Comparator<? super A> comp = map1.comparator();
         final Iterator<Map.Entry<A, B>> mi1 = map1.entrySet().iterator();
@@ -181,7 +181,7 @@ public class SetTools {
     }
 
     // now the same for set-set
-    public static <A> TreeSet<A> joinConstructive(final TreeSet<A> set1, final TreeSet<A> set2) {
+    public final static <A> TreeSet<A> joinConstructive(final TreeSet<A> set1, final TreeSet<A> set2) {
         // comparators must be equal
         if ((set1 == null) || (set2 == null)) return null;
         if (set1.comparator() != set2.comparator()) return null;
@@ -201,7 +201,7 @@ public class SetTools {
         return joinConstructiveByEnumeration(set1, set2);
     }
 
-    private static <A> TreeSet<A> joinConstructiveByTest(final TreeSet<A> small, final TreeSet<A> large) {
+    private final static <A> TreeSet<A> joinConstructiveByTest(final TreeSet<A> small, final TreeSet<A> large) {
         final Iterator<A> mi = small.iterator();
         final TreeSet<A> result = new TreeSet<A>(small.comparator());
         A o;
@@ -212,7 +212,7 @@ public class SetTools {
         return result;
     }
 
-    private static <A> TreeSet<A> joinConstructiveByEnumeration(final TreeSet<A> set1, final TreeSet<A> set2) {
+    private final static <A> TreeSet<A> joinConstructiveByEnumeration(final TreeSet<A> set1, final TreeSet<A> set2) {
         // implement pairwise enumeration
         final Comparator<? super A> comp = set1.comparator();
         final Iterator<A> mi = set1.iterator();
@@ -238,8 +238,41 @@ public class SetTools {
         return result;
     }
 
-    // now the same for set-set
-    public static boolean anymatch(final TreeSet set1, final TreeSet set2) {
+    /**
+     * test if one set is totally included in another set
+     * @param <A>
+     * @param small
+     * @param large
+     * @return true if the small set is completely included in the large set
+     */
+    public final static <A> boolean totalInclusion(final Set<A> small, final Set<A> large) {
+        for (A o: small) {
+            if (!large.contains(o)) return false;
+        }
+        return true;
+    }
+
+    /**
+     * test if one set is totally included in another set
+     * @param small
+     * @param large
+     * @return true if the small set is completely included in the large set
+     */
+    public final static boolean totalInclusion(final HandleSet small, final HandleSet large) {
+        for (byte[] handle: small) {
+            if (!large.has(handle)) return false;
+        }
+        return true;
+    }
+
+    /**
+     * test if the intersection of two sets is not empty
+     * @param <A>
+     * @param set1
+     * @param set2
+     * @return true if any element of the first set is part of the second set or vice-versa
+     */
+    public final static <A> boolean anymatch(final TreeSet<A> set1, final TreeSet<A> set2) {
         // comparators must be equal
         if ((set1 == null) || (set2 == null)) return false;
         if (set1.comparator() != set2.comparator()) return false;
@@ -259,7 +292,13 @@ public class SetTools {
         return anymatchByEnumeration(set1, set2);
     }
 
-    public static boolean anymatch(final HandleSet set1, final HandleSet set2) {
+    /**
+     * test if the intersection of two sets is not empty
+     * @param set1
+     * @param set2
+     * @return true if any element of the first set is part of the second set or vice-versa
+     */
+    public final static boolean anymatch(final HandleSet set1, final HandleSet set2) {
         // comparators must be equal
         if ((set1 == null) || (set2 == null)) return false;
         if (set1.comparator() != set2.comparator()) return false;
@@ -279,7 +318,7 @@ public class SetTools {
         return anymatchByEnumeration(set1, set2);
     }
 
-    private static <A> boolean anymatchByTest(final TreeSet<A> small, final TreeSet<A> large) {
+    private final static <A> boolean anymatchByTest(final TreeSet<A> small, final TreeSet<A> large) {
         final Iterator<A> mi = small.iterator();
         A o;
         while (mi.hasNext()) {
@@ -289,7 +328,7 @@ public class SetTools {
         return false;
     }
 
-    private static boolean anymatchByTest(final HandleSet small, final HandleSet large) {
+    private final static boolean anymatchByTest(final HandleSet small, final HandleSet large) {
         final Iterator<byte[]> mi = small.iterator();
         byte[] o;
         while (mi.hasNext()) {
@@ -299,7 +338,7 @@ public class SetTools {
         return false;
     }
 
-    private static <A> boolean anymatchByEnumeration(final TreeSet<A> set1, final TreeSet<A> set2) {
+    private final static <A> boolean anymatchByEnumeration(final TreeSet<A> set1, final TreeSet<A> set2) {
         // implement pairwise enumeration
         final Comparator<? super A> comp = set1.comparator();
         final Iterator<A> mi = set1.iterator();
@@ -322,7 +361,7 @@ public class SetTools {
         return false;
     }
 
-    private static boolean anymatchByEnumeration(final HandleSet set1, final HandleSet set2) {
+    private final static boolean anymatchByEnumeration(final HandleSet set1, final HandleSet set2) {
         // implement pairwise enumeration
         final Comparator<byte[]> comp = set1.comparator();
         final Iterator<byte[]> mi = set1.iterator();
@@ -370,7 +409,7 @@ public class SetTools {
     }
     */
 
-    public static <A, B> void excludeDestructive(final Map<A, B> map, final Set<A> set) {
+    public final static <A, B> void excludeDestructive(final Map<A, B> map, final Set<A> set) {
         // comparators must be equal
         if (map == null) return;
         if (set == null) return;
@@ -383,18 +422,18 @@ public class SetTools {
         excludeDestructiveByTestSetInMap(map, set);
     }
 
-    private static <A, B> void excludeDestructiveByTestMapInSet(final Map<A, B> map, final Set<A> set) {
+    private final static <A, B> void excludeDestructiveByTestMapInSet(final Map<A, B> map, final Set<A> set) {
         final Iterator<A> mi = map.keySet().iterator();
         while (mi.hasNext()) if (set.contains(mi.next())) mi.remove();
     }
 
-    private static <A, B> void excludeDestructiveByTestSetInMap(final Map<A, B> map, final Set<A> set) {
+    private final static <A, B> void excludeDestructiveByTestSetInMap(final Map<A, B> map, final Set<A> set) {
         final Iterator<A> si = set.iterator();
         while (si.hasNext()) map.remove(si.next());
     }
 
     // and the same again with set-set
-    public static <A> void excludeDestructive(final Set<A> set1, final Set<A> set2) {
+    public final static <A> void excludeDestructive(final Set<A> set1, final Set<A> set2) {
         if (set1 == null) return;
         if (set2 == null) return;
         assert !(set1 instanceof TreeSet && set2 instanceof TreeSet) || ((TreeSet) set1).comparator() == ((TreeSet) set2).comparator();
@@ -406,19 +445,19 @@ public class SetTools {
         excludeDestructiveByTestLargeInSmall(set1, set2);
     }
 
-    private static <A> void excludeDestructiveByTestSmallInLarge(final Set<A> small, final Set<A> large) {
+    private final static <A> void excludeDestructiveByTestSmallInLarge(final Set<A> small, final Set<A> large) {
        final Iterator<A> mi = small.iterator();
         while (mi.hasNext()) if (large.contains(mi.next())) mi.remove();
     }
 
-    private static <A> void excludeDestructiveByTestLargeInSmall(final Set<A> large, final Set<A> small) {
+    private final static <A> void excludeDestructiveByTestLargeInSmall(final Set<A> large, final Set<A> small) {
         final Iterator<A> si = small.iterator();
         while (si.hasNext()) large.remove(si.next());
     }
 
     // ------------------------------------------------------------------------------------------------
 
-    public static TreeMap<String, String> loadMap(final String filename, final String sep) {
+    public final static TreeMap<String, String> loadMap(final String filename, final String sep) {
         final TreeMap<String, String> map = new TreeMap<String, String>();
         BufferedReader br = null;
         try {
@@ -437,7 +476,7 @@ public class SetTools {
         return map;
     }
 
-    public static TreeMap<String, ArrayList<String>> loadMapMultiValsPerKey(final String filename, final String sep) {
+    public final static TreeMap<String, ArrayList<String>> loadMapMultiValsPerKey(final String filename, final String sep) {
         final TreeMap<String, ArrayList<String>> map = new TreeMap<String, ArrayList<String>>();
         BufferedReader br = null;
         try {
@@ -460,7 +499,7 @@ public class SetTools {
         return map;
     }
 
-    public static TreeSet<String> loadList(final File file, final Comparator<String> c) {
+    public final static TreeSet<String> loadList(final File file, final Comparator<String> c) {
         final TreeSet<String> list = new TreeSet<String>(c);
         if (!(file.exists())) return list;
 
@@ -480,7 +519,7 @@ public class SetTools {
         return list;
     }
 
-    public static String setToString(final HandleSet set, final char separator) {
+    public final static String setToString(final HandleSet set, final char separator) {
         final Iterator<byte[]> i = set.iterator();
         final StringBuilder sb = new StringBuilder(set.size() * 7);
         if (i.hasNext()) sb.append(new String(i.next()));
@@ -490,7 +529,7 @@ public class SetTools {
         return sb.toString();
     }
 
-    public static String setToString(final Set<String> set, final char separator) {
+    public final static String setToString(final Set<String> set, final char separator) {
         final Iterator<String> i = set.iterator();
         final StringBuilder sb = new StringBuilder(set.size() * 7);
         if (i.hasNext()) sb.append(i.next());
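
totalInclusion and anymatch are the two set primitives the new QueryParams.matches is built on. Their contract in plain terms, shown with ordinary TreeSet<String> instances (illustrative only, not code from the patch):

    // illustrative only: contract of the new SetTools helpers
    TreeSet<String> small = new TreeSet<String>(Arrays.asList("a", "b"));
    TreeSet<String> large = new TreeSet<String>(Arrays.asList("a", "b", "c"));
    SetTools.totalInclusion(small, large); // true:  every element of small is in large
    SetTools.totalInclusion(large, small); // false: "c" is not in small
    SetTools.anymatch(small, large);       // true:  the intersection {a, b} is not empty
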
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index cabb7514a..cc8bd07b0 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -29,7 +29,6 @@ package net.yacy.repository;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.Writer;
 import java.net.MalformedURLException;
 import java.util.Arrays;
@@ -39,6 +38,7 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
+import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.document.Document;
 import net.yacy.document.TextParser;
 import net.yacy.document.ParserException;
@@ -57,7 +57,6 @@ import de.anomic.crawler.retrieval.Request;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.crawler.retrieval.SMBLoader;
 import de.anomic.http.client.Cache;
-import de.anomic.http.client.Client;
 import de.anomic.http.server.HeaderFramework;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.http.server.ResponseHeader;
@@ -98,38 +97,7 @@ public final class LoaderDispatcher {
     public HashSet<String> getSupportedProtocols() {
         return (HashSet<String>) this.supportedProtocols.clone();
     }
-
-    /**
-     * load a resource from the web, from ftp, from smb or a file
-     * @param url
-     * @param forText shows that this was a for-text crawling request
-     * @param global shows that this was a global crawling request
-     * @param cacheStratgy strategy according to CACHE_STRATEGY_NOCACHE,CACHE_STRATEGY_IFFRESH,CACHE_STRATEGY_IFEXIST,CACHE_STRATEGY_CACHEONLY
-     * @return the loaded entity in a Response object
-     * @throws IOException
-     */
-    public Response load(
-            final DigestURI url,
-            final boolean forText,
-            final boolean global,
-            CrawlProfile.CacheStrategy cacheStratgy,
-            long maxFileSize) throws IOException {
-        return load(request(url, forText, global), cacheStratgy, maxFileSize);
-    }
-
-    public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {
-        byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize).getContent();
-        if (b == null) throw new IOException("load == null");
-        File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
-
-        // transaction-safe writing
-        File parent = targetFile.getParentFile();
-        if (!parent.exists()) parent.mkdirs();
-        FileUtils.copy(b, tmp);
-        tmp.renameTo(targetFile);
-    }
-
     /**
      * generate a request object
      * @param url the target url
@@ -160,7 +128,27 @@ public final class LoaderDispatcher {
             0,
             0);
     }
+
+    public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize, File targetFile) throws IOException {
+        byte[] b = load(request(url, false, true), cacheStrategy, maxFileSize).getContent();
+        if (b == null) throw new IOException("load == null");
+        File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
+
+        // transaction-safe writing
+        File parent = targetFile.getParentFile();
+        if (!parent.exists()) parent.mkdirs();
+        FileUtils.copy(b, tmp);
+        tmp.renameTo(targetFile);
+    }
+
+    /**
+     * load a resource from the web, from ftp, from smb or a file
+     * @param request the request essentials
+     * @param cacheStrategy strategy according to CACHE_STRATEGY_NOCACHE,CACHE_STRATEGY_IFFRESH,CACHE_STRATEGY_IFEXIST,CACHE_STRATEGY_CACHEONLY
+     * @return the loaded entity in a Response object
+     * @throws IOException
+     */
     public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
         // get the protocol of the next URL
         final String protocol = request.url().getProtocol();
@@ -272,132 +260,40 @@ public final class LoaderDispatcher {
     }
 
     /**
-     * load the url as resource from the web or the cache
-     * @param url
-     * @param fetchOnline
-     * @param socketTimeout
-     * @param forText
+     * load the url as byte[] content from the web or the cache
+     * @param request
+     * @param cacheStrategy
      * @return the content as {@link byte[]}
     * @throws IOException
     */
-    public byte[] getResource(final DigestURI url, CrawlProfile.CacheStrategy cacheStrategy, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
+    public byte[] loadContent(final Request request, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
         // try to download the resource using the loader
         final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-        final Response entry = load(url, forText, reindexing, cacheStrategy, maxFileSize);
+        final Response entry = load(request, cacheStrategy, maxFileSize);
         if (entry == null) return null; // not found in web
 
         // read resource body (if it is there)
         return entry.getContent();
     }
 
-    /**
-     * Tries to load and parse a resource specified by it's URL.
-     * If the resource is not stored in cache and if fetchOnline is set the
-     * this function tries to download the resource from web.
-     *
-     * @param url the URL of the resource
-     * @param fetchOnline specifies if the resource should be loaded from web if it'as not available in the cache
-     * @param timeout
-     * @param forText
-     * @param global the domain of the search. If global == true then the content is re-indexed
-     * @return the parsed document as {@link Document}
-     */
-    public static Document retrieveDocument(final DigestURI url, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, final boolean forText, final boolean global, long maxFileSize) {
+    public Document loadDocument(final Request request, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, long maxFileSize) throws IOException, ParserException {
 
         // load resource
-        byte[] resContent = null;
-        ResponseHeader responseHeader = null;
-        try {
-            final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, cacheStrategy, maxFileSize);
-            if (entry == null) {
-                Log.logFine("snippet fetch", "no Response for url " + url);
-                return null;
-            }
+        final Response response = load(request, cacheStrategy, maxFileSize);
+        if (response == null) throw new IOException("no Response for url " + request.url());
 
-            // read resource body (if it is there)
-            resContent = entry.getContent();
-
-            // read a fresh header
-            responseHeader = entry.getResponseHeader();
-
-            // if it is still not available, report an error
-            if (resContent == null || responseHeader == null) {
-                Log.logFine("snippet fetch", "no Content available for url " + url);
-                return null;
-            }
-        } catch (final Exception e) {
-            Log.logFine("snippet fetch", "error loading resource: " + e.getMessage() + " for url " + url);
-            return null;
-        }
+        // if it is still not available, report an error
+        if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + request.url());
 
         // parse resource
-        Document document = null;
-        try {
-            document = parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader);
-        } catch (final ParserException e) {
-            Log.logFine("snippet fetch", "parser error " + e.getMessage() + " for url " + url);
-            return null;
-        } finally {
-            resContent = null;
-        }
-        return document;
-    }
-
-    /**
-     * Parse the resource
-     * @param url the URL of the resource
-     * @param contentLength the contentLength of the resource
-     * @param resourceStream the resource body as stream
-     * @param docInfo metadata about the resource
-     * @return the extracted data
-     * @throws ParserException
-     */
-    public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream, ResponseHeader responseHeader) throws ParserException {
-        try {
-            if (resourceStream == null) return null;
-
-            // STEP 1: if no resource metadata is available, try to load it from cache
-            if (responseHeader == null) {
-                // try to get the header from the htcache directory
-                try {
-                    responseHeader = Cache.getResponseHeader(url);
-                } catch (final Exception e) {
-                    // ignore this. resource info loading failed
-                }
-            }
-
-            // STEP 2: if the metadata is still null try to download it from web
-            if ((responseHeader == null) && (url.getProtocol().startsWith("http"))) {
-                // TODO: we need a better solution here
-                // e.g. encapsulate this in the crawlLoader class
-
-                // getting URL mimeType
-                try {
-                    responseHeader = Client.whead(url.toString());
-                } catch (final Exception e) {
-                    // ingore this. http header download failed
-                }
-            }
-
-            // STEP 3: if the metadata is still null try to guess the mimeType of the resource
-            String supportError = TextParser.supports(url, responseHeader == null ? null : responseHeader.mime());
-            if (supportError != null) {
-                return null;
-            }
-            if (responseHeader == null) {
-                return TextParser.parseSource(url, null, null, contentLength, resourceStream);
-            }
-            return TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
-        } catch (final InterruptedException e) {
-            // interruption of thread detected
-            return null;
-        }
+        return response.parse();
     }
 
-    public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, CrawlProfile.CacheStrategy cachePolicy) throws IOException {
+    public ContentScraper parseResource(final DigestURI location, CrawlProfile.CacheStrategy cachePolicy) throws IOException {
         // load page
-        final long maxFileSize = loader.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-        Response r = loader.load(location, true, false, cachePolicy, maxFileSize);
+        final long maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
+        Response r = this.load(request(location, true, false), cachePolicy, maxFileSize);
         byte[] page = (r == null) ? null : r.getContent();
         if (page == null) throw new IOException("no response from url " + location.toString());
 
@@ -409,6 +305,40 @@ public final class LoaderDispatcher {
         return scraper;
     }
 
+    /**
+     * load all links from a resource
+     * @param url the url that shall be loaded
+     * @param cacheStrategy the cache strategy
+     * @return a map from URLs to the anchor texts of the urls
+     * @throws IOException
+     */
+    public final Map<MultiProtocolURI, String> loadLinks(DigestURI url, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
+        Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE);
+        if (response == null) throw new IOException("response == null");
+        ResponseHeader responseHeader = response.getResponseHeader();
+        byte[] resource = response.getContent();
+        if (resource == null) throw new IOException("resource == null");
+        if (responseHeader == null) throw new IOException("responseHeader == null");
+
+        Document document = null;
+        String supportError = TextParser.supports(url, responseHeader.mime());
+        if (supportError != null) throw new IOException("no parser support: " + supportError);
+        try {
+            document = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), resource.length, new ByteArrayInputStream(resource));
+            if (document == null) throw new IOException("document == null");
+        } catch (final ParserException e) {
+            throw new IOException("parser error: " + e.getMessage());
+        } catch (InterruptedException e) {
+            throw new IOException("interrupted");
+        } finally {
+            resource = null;
+        }
+
+        Map<MultiProtocolURI, String> result = document.getHyperlinks();
+        document.close();
+        return result;
+    }
+
     public synchronized void cleanupAccessTimeTable(long timeout) {
         final Iterator<Map.Entry<String, Long>> i = accessTime.entrySet().iterator();
         Map.Entry<String, Long> e;
@@ -439,7 +369,7 @@ public final class LoaderDispatcher {
             if (this.cache.exists()) return;
             try {
                 // load from the net
-                Response response = load(new DigestURI(this.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, this.maxFileSize);
+                Response response = load(request(new DigestURI(this.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, this.maxFileSize);
                 byte[] b = response.getContent();
                 FileUtils.copy(b, this.cache);
             } catch (MalformedURLException e) {} catch (IOException e) {}
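
loadLinks rounds the new API off: one call to get the outgoing links of a page as a URL-to-anchor-text map. A closing usage sketch (the example URL and the surrounding error handling are assumptions, not part of the patch):

    // sketch: collect the hyperlinks of a page with the new loadLinks helper
    try {
        final DigestURI url = new DigestURI("http://example.org/", null);
        final Map<MultiProtocolURI, String> links = sb.loader.loadLinks(url, CrawlProfile.CacheStrategy.IFEXIST);
        for (final Map.Entry<MultiProtocolURI, String> link : links.entrySet()) {
            System.out.println(link.getKey().toString() + " -> " + link.getValue());
        }
    } catch (final IOException e) {
        Log.logWarning("loadLinks", "cannot load: " + e.getMessage());
    }
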