more abstraction for access to LoaderDispatcher and cache

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6937 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 7bcfa033c9
commit 777195e8d1
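
In short, this commit removes the loader's url-plus-flags convenience overloads: callers now build a Request with loader.request(url, forText, global) and pass it to load() together with a cache strategy. A minimal before/after sketch (not part of the commit; it assumes a Switchboard sb and a DigestURI url in scope, as in the hunks below):

import java.io.IOException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;

// before: url and crawl flags went straight into load()
// Response response = sb.loader.load(url, /* forText */ true, /* global */ false,
//         CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);

// after: the flags are captured in a Request object first
try {
    Request request = sb.loader.request(url, /* forText */ true, /* global */ false);
    Response response = sb.loader.load(request, CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
    byte[] content = (response == null) ? null : response.getContent();
} catch (IOException e) {
    Log.logException(e);
}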

@ -30,6 +30,7 @@
// if the shell's current path is HTROOT
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
@ -37,11 +38,11 @@ import java.util.Iterator;
import java.util.Set;
import net.yacy.document.Document;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.BookmarkHelper;
@ -187,9 +188,9 @@ public class Bookmarks {
// try to get the bookmark from the LURL database
final URIMetadataRow urlentry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(urlHash.getBytes(), null, 0);
Document document = null;
if (urlentry != null) {
if (urlentry != null) try {
final URIMetadataRow.Components metadata = urlentry.metadata();
document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE);
document = sb.loader.loadDocument(sb.loader.request(metadata.url(), true, false), CrawlProfile.CacheStrategy.IFEXIST, 5000, Long.MAX_VALUE);
prop.put("mode_edit", "0"); // create mode
prop.put("mode_url", metadata.url().toNormalform(false, true));
prop.putHTML("mode_title", metadata.dc_title());
@ -199,7 +200,7 @@ public class Bookmarks {
prop.putHTML("mode_path","");
prop.put("mode_public", "0");
prop.put("mode_feed", "0"); //TODO: check if it IS a feed
}
} catch (IOException e) {Log.logException(e);} catch (ParserException e) {Log.logException(e);}
if (document != null) document.close();
} else {
// get from the bookmark database
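
The static LoaderDispatcher.retrieveDocument helper used to swallow load and parse errors; the new instance method loadDocument reports them as checked IOException and ParserException, so Bookmarks now wraps the call and closes the Document itself. A compact sketch of the pattern, under the same assumptions as the hunk above (sb.loader is the shared LoaderDispatcher, metadata.url() a DigestURI):

Document document = null;
try {
    document = sb.loader.loadDocument(
            sb.loader.request(metadata.url(), true, false),           // forText = true, global = false
            CrawlProfile.CacheStrategy.IFEXIST, 5000, Long.MAX_VALUE);
} catch (IOException e) {
    Log.logException(e);      // network or cache failure
} catch (ParserException e) {
    Log.logException(e);      // unsupported or broken content
}
if (document != null) document.close();   // release parser resources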

@ -63,7 +63,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geon0Load")) {
// load from the net
try {
Response response = sb.loader.load(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocalization(LibraryProvider.Dictionary.GEON0.file()));
@ -103,7 +103,7 @@ public class DictionaryLoader_p {
if (post.containsKey("geo1Load")) {
// load from the net
try {
Response response = sb.loader.load(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
LibraryProvider.geoLoc.removeLocalization(LibraryProvider.Dictionary.GEODB0.nickname);
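
For downloads like these dictionary files, LoaderDispatcher also keeps a convenience overload that writes the loaded content straight to a target file through a .tmp rename (see the load(DigestURI, CacheStrategy, long, File) method further down in this diff). A hedged sketch of that variant, reusing the GEON0 constants from the hunk above:

try {
    DigestURI dictUrl = new DigestURI(LibraryProvider.Dictionary.GEON0.url);
    File target = LibraryProvider.Dictionary.GEON0.file();
    // fetch with NOCACHE, write to target.tmp first, then rename (transaction-safe writing)
    sb.loader.load(dictUrl, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, target);
} catch (IOException e) {
    Log.logException(e);
}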

@ -64,7 +64,7 @@ public class RSSLoader_p {
// if the resource body was not cached we try to load it from web
Response entry = null;
try {
entry = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
entry = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
} catch (final Exception e) {
return prop;
}

@ -25,7 +25,6 @@
//javac -classpath .:../Classes Status.java
//if the shell's current path is HTROOT
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
@ -43,13 +42,11 @@ import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.client.Cache;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseHeader;
import de.anomic.search.Segment;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
@ -168,35 +165,22 @@ public class ViewFile {
// loading the resource content as byte array
prop.put("error_incache", Cache.has(url) ? 1 : 0);
String resMime = null;
ResponseHeader responseHeader = Cache.getResponseHeader(url);
byte[] resource = Cache.getContent(url);
if ((resource == null || responseHeader == null) && authorized) {
// load resource from net
Response response = null;
try {
response = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
} catch (IOException e) {
prop.put("error", "4");
prop.putHTML("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
if (response != null) {
resource = response.getContent();
responseHeader = response.getResponseHeader();
}
Response response = null;
try {
response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CrawlProfile.CacheStrategy.IFEXIST : CrawlProfile.CacheStrategy.CACHEONLY, Long.MAX_VALUE);
} catch (IOException e) {
prop.put("error", "4");
prop.put("error_errorText", "error loading resource: " + e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
// if resource not available just fail
if (resource == null || responseHeader == null) {
if (response == null) {
prop.put("error", "4");
prop.put("error_errorText", "No resource available");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
resMime = responseHeader.mime();
final String[] wordArray = wordArray(post.get("words", null));
@ -205,14 +189,12 @@ public class ViewFile {
// TODO: how to handle very large files here ?
String content;
try {
content = new String(resource, "UTF-8");
content = new String(response.getContent(), "UTF-8");
} catch (final Exception e) {
prop.put("error", "4");
prop.putHTML("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
} finally {
resource = null;
}
prop.put("error", "0");
@ -231,7 +213,7 @@ public class ViewFile {
// parsing the resource content
Document document = null;
try {
document = LoaderDispatcher.parseDocument(url, resource.length, new ByteArrayInputStream(resource), responseHeader);
document = response.parse();
if (document == null) {
prop.put("error", "5");
prop.put("error_errorText", "Unknown error");
@ -243,11 +225,7 @@ public class ViewFile {
prop.putHTML("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
} finally {
resource = null;
}
resMime = document.dc_format();
if (viewMode.equals("parsed")) {
final String content = new String(document.getTextBytes());
@ -352,8 +330,8 @@ public class ViewFile {
prop.put("error_wordCount", wordCount);
prop.putHTML("error_desc", descr);
prop.putNum("error_size", size);
prop.put("error_mimeTypeAvailable", (resMime == null) ? "0" : "1");
prop.put("error_mimeTypeAvailable_mimeType", resMime);
prop.put("error_mimeTypeAvailable", (response.getMimeType() == null) ? "0" : "1");
prop.put("error_mimeTypeAvailable_mimeType", response.getMimeType());
return prop;
}

@ -84,14 +84,13 @@ public class ViewImage {
int height = post.getInt("height", 0);
int maxwidth = post.getInt("maxwidth", 0);
int maxheight = post.getInt("maxheight", 0);
final int timeout = post.getInt("timeout", 5000);
// get the image as stream
Image scaled = iconcache.get(urlString);
if (scaled == null) {
byte[] resourceb = null;
if (url != null) try {
resourceb = sb.loader.getResource(url, CrawlProfile.CacheStrategy.IFEXIST, timeout, false, true);
resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CrawlProfile.CacheStrategy.IFEXIST);
} catch (IOException e) {
Log.logWarning("ViewImage", "cannot load: " + e.getMessage());
}
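
ViewImage no longer passes its own socket timeout: the new loadContent takes a Request and a cache strategy, and the size limit comes from the crawler.http.maxFileSize setting inside the dispatcher. Roughly, with url being the image's DigestURI as above:

byte[] resourceb = null;
if (url != null) try {
    // request(url, forText = false, global = true); IFEXIST serves from the cache when possible
    resourceb = sb.loader.loadContent(sb.loader.request(url, false, true),
            CrawlProfile.CacheStrategy.IFEXIST);
} catch (IOException e) {
    Log.logWarning("ViewImage", "cannot load: " + e.getMessage());
}
if (resourceb != null) {
    // decode the bytes and feed the icon cache as before
}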

@ -6,7 +6,6 @@ import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.server.RequestHeader;
@ -55,14 +54,9 @@ public class getpageinfo_p {
}
ContentScraper scraper = null;
if (u != null) try {
scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CacheStrategy.IFFRESH);
scraper = sb.loader.parseResource(u, CrawlProfile.CacheStrategy.IFFRESH);
} catch (final IOException e) {
// try again, try harder
try {
scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CacheStrategy.IFEXIST);
} catch (final IOException ee) {
// now thats a fail, do nothing
}
// now thats a fail, do nothing
}
if (scraper != null) {
// put the document title
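
parseResource is now an instance method on the dispatcher, so the loader is no longer passed in as a parameter and the old second attempt with IFEXIST is gone. A small usage sketch, assuming u is a DigestURI as in the hunk above; getAnchors() is the same accessor yacyRelease uses later in this diff:

ContentScraper scraper = null;
try {
    scraper = sb.loader.parseResource(u, CrawlProfile.CacheStrategy.IFFRESH);
} catch (final IOException e) {
    // load or parse failed; scraper stays null
}
if (scraper != null) {
    // url -> anchor text relation of all links found in the page
    final Map<MultiProtocolURI, String> anchors = scraper.getAnchors();
}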

@ -36,6 +36,7 @@ import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.ParserException;
import net.yacy.document.geolocalization.Location;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -49,7 +50,6 @@ import net.yacy.kelondro.util.Formatter;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.SetTools;
import net.yacy.kelondro.util.ISO639;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.DidYouMean;
@ -428,8 +428,12 @@ public class yacysearch {
final URIMetadataRow urlentry = indexSegment.urlMetadata().load(recommendHash.getBytes(), null, 0);
if (urlentry != null) {
final URIMetadataRow.Components metadata = urlentry.metadata();
Document document;
document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE);
Document document = null;
try {
document = sb.loader.loadDocument(sb.loader.request(metadata.url(), true, false), CrawlProfile.CacheStrategy.IFEXIST, 5000, Long.MAX_VALUE);
} catch (IOException e) {
} catch (ParserException e) {
}
if (document != null) {
// create a news message
final HashMap<String, String> map = new HashMap<String, String>();

@ -26,9 +26,12 @@
package de.anomic.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.util.Date;
import net.yacy.document.Classification;
import net.yacy.document.Document;
import net.yacy.document.ParserException;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.DateFormatter;
@ -799,4 +802,16 @@ public class Response {
return processCase;
}
public Document parse() throws ParserException {
String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
if (supportError != null) throw new ParserException("no parser support:" + supportError, url());
try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content.length, new ByteArrayInputStream(this.content));
} catch (InterruptedException e) {
return null;
}
}
}
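
Response.parse() now bundles the mime-type support check and the TextParser.parseSource call that the static LoaderDispatcher.parseDocument used to provide, taking mime type and character encoding from the response header (falling back to UTF-8). A minimal usage sketch, assuming url and sb.loader in scope; note that parse() can return null if the parser thread was interrupted:

try {
    final Response response = sb.loader.load(sb.loader.request(url, true, false),
            CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
    if (response != null) {
        final Document document = response.parse();   // throws ParserException for unsupported content
        if (document != null) {
            // ... read title, text, links ...
            document.close();
        }
    }
} catch (IOException e) {
    Log.logException(e);
} catch (ParserException e) {
    Log.logException(e);
}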

@ -24,6 +24,7 @@
package de.anomic.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
@ -35,13 +36,13 @@ import de.anomic.data.MimeTable;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.repository.LoaderDispatcher;
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
@ -118,7 +119,16 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
return new ArrayList<MediaSnippet>();
}
final Document document = LoaderDispatcher.retrieveDocument(url, cacheStrategy, timeout, false, reindexing, Long.MAX_VALUE);
Document document;
try {
document = Switchboard.getSwitchboard().loader.loadDocument(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, timeout, Long.MAX_VALUE);
} catch (IOException e) {
Log.logFine("snippet fetch", "load error: " + e.getMessage());
return new ArrayList<MediaSnippet>();
} catch (ParserException e) {
Log.logFine("snippet fetch", "parser error: " + e.getMessage());
return new ArrayList<MediaSnippet>();
}
final ArrayList<MediaSnippet> a = new ArrayList<MediaSnippet>();
if (document != null) {
if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, ContentDomain.AUDIO));

@ -274,7 +274,19 @@ public final class QueryParams {
return new String(sb);
}
protected static final boolean matches(final String text, final HandleSet keyhashes) {
/**
* check if the given text matches with the query
* this checks inclusion and exclusion words
* @param text
* @return true if the query matches with the given text
*/
public final boolean matches(final String text) {
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
if (SetTools.anymatch(wordhashes, this.excludeHashes)) return false;
return SetTools.totalInclusion(this.queryHashes, wordhashes);
}
protected static final boolean anymatch(final String text, final HandleSet keyhashes) {
// returns true if any of the word hashes in keyhashes appear in the String text
// to do this, all words in the string must be recognized and transcoded to word hashes
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
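
The new instance method matches(text) checks both directions of a query: every query word hash must occur in the text (SetTools.totalInclusion) and none of the exclusion hashes may occur (SetTools.anymatch). A hypothetical caller, e.g. filtering on a result title (query is a QueryParams instance; the static anymatch form is what RankingProcess uses in the next hunk):

// accept only if all query words are present and no exclusion word matches
if (query.matches(metadata.dc_title())) {
    // keep this result
}

// package-internal helper (de.anomic.search): does any exclusion word occur in the text?
if (QueryParams.anymatch(pagetitle, query.excludeHashes)) {
    // drop this result
}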

@ -428,9 +428,9 @@ public final class RankingProcess extends Thread {
final String pagetitle = metadata.dc_title().toLowerCase();
// check exclusion
if ((QueryParams.matches(pagetitle, query.excludeHashes)) ||
(QueryParams.matches(pageurl.toLowerCase(), query.excludeHashes)) ||
(QueryParams.matches(pageauthor.toLowerCase(), query.excludeHashes))) {
if ((QueryParams.anymatch(pagetitle, query.excludeHashes)) ||
(QueryParams.anymatch(pageurl.toLowerCase(), query.excludeHashes)) ||
(QueryParams.anymatch(pageauthor.toLowerCase(), query.excludeHashes))) {
continue;
}

@ -26,10 +26,8 @@
package de.anomic.search;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.Iterator;
@ -386,49 +384,34 @@ public class Segment {
final URIMetadataRow.Components metadata = entry.metadata();
if (metadata == null || metadata.url() == null) return 0;
InputStream resourceContent = null;
try {
// get the resource content
byte[] resourceb = null;
try {
resourceb = loader.getResource(metadata.url(), cacheStrategy, 10000, true, false);
} catch (IOException e) {
Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
}
if (resourceb == null) {
// parse the resource
final Document document = loader.loadDocument(loader.request(metadata.url(), true, false), cacheStrategy, 10000, Long.MAX_VALUE);
if (document == null) {
// delete just the url entry
urlMetadata().remove(urlhash);
return 0;
} else {
resourceContent = new ByteArrayInputStream(resourceb);
final long resourceContentLength = resourceb.length;
// parse the resource
final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength, resourceContent, null);
// get the word set
Set<String> words = null;
try {
words = new Condenser(document, true, true).words().keySet();
} catch (final UnsupportedEncodingException e) {
Log.logException(e);
}
// delete all word references
int count = 0;
if (words != null) count = termIndex().remove(Word.words2hashesHandles(words), urlhash);
// finally delete the url entry itself
urlMetadata().remove(urlhash);
return count;
}
// get the word set
Set<String> words = null;
try {
words = new Condenser(document, true, true).words().keySet();
} catch (final UnsupportedEncodingException e) {
Log.logException(e);
}
// delete all word references
int count = 0;
if (words != null) count = termIndex().remove(Word.words2hashesHandles(words), urlhash);
// finally delete the url entry itself
urlMetadata().remove(urlhash);
return count;
} catch (final ParserException e) {
return 0;
} catch (IOException e) {
Log.logException(e);
return 0;
} finally {
if (resourceContent != null) try { resourceContent.close(); } catch (final Exception e) {/* ignore this */}
}
}

@ -24,7 +24,6 @@
package de.anomic.search;
import java.io.ByteArrayInputStream;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
@ -51,7 +50,6 @@ import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.client.Cache;
import de.anomic.http.server.ResponseHeader;
import de.anomic.yacy.yacySearch;
public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
@ -331,8 +329,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
* LOADING RESOURCE DATA
* =========================================================================== */
// if the snippet is not in the cache, we can try to get it from the htcache
byte[] resContent = null;
ResponseHeader responseHeader = null;
Response response;
try {
// first try to get the snippet from metadata
String loc;
@ -350,31 +347,9 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(new String(url.hash())));
} else {
// trying to load the resource from the cache
resContent = Cache.getContent(url);
responseHeader = Cache.getResponseHeader(url);
if ((resContent == null || responseHeader == null) && cacheStrategy.isAllowedToFetchOnline()) {
// if not found try to download it
// download resource or get it from the cache
final Response entry = loader.load(url, true, reindexing, cacheStrategy, Long.MAX_VALUE);
// get resource metadata (e.g. the http headers for http resources)
if (entry != null) {
// place entry on indexing queue
Switchboard.getSwitchboard().toIndexer(entry);
// read resource body (if it is there)
final byte[] resourceArray = entry.getContent();
if (resourceArray != null) {
resContent = resourceArray;
} else {
resContent = Cache.getContent(url);
}
}
source = SOURCE_WEB;
}
if (resContent == null) {
boolean objectWasInCache = Cache.has(url);
response = loader.load(loader.request(url, true, reindexing), cacheStrategy, Long.MAX_VALUE);
if (response == null) {
// in case that we did not get any result we can still return a success when we are not allowed to go online
if (cacheStrategy.mustBeOffline()) {
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "omitted network load (not allowed), no cache entry");
@ -383,6 +358,11 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// if it is still not available, report an error
return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource from net, no cache entry");
}
if (!objectWasInCache) {
// place entry on indexing queue
Switchboard.getSwitchboard().toIndexer(response);
source = SOURCE_WEB;
}
}
} catch (final Exception e) {
//Log.logException(e);
@ -394,11 +374,9 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
* =========================================================================== */
Document document = null;
try {
document = LoaderDispatcher.parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader);
document = response.parse();
} catch (final ParserException e) {
return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
} finally {
resContent = null;
}
if (document == null) return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed

@ -85,7 +85,7 @@ public class OSMTile {
// download resource using the crawler and keep resource in memory if possible
Response entry = null;
try {
entry = Switchboard.getSwitchboard().loader.load(tileURL, false, false, CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
} catch (IOException e) {
Log.logWarning("yamyOSM", "cannot load: " + e.getMessage());
return null;

@ -52,7 +52,6 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.OS;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
@ -235,12 +234,12 @@ public final class yacyRelease extends yacyVersion {
// returns the version info if successful, null otherwise
ContentScraper scraper;
try {
scraper = LoaderDispatcher.parseResource(Switchboard.getSwitchboard().loader, location.getLocationURL(), CrawlProfile.CacheStrategy.NOCACHE);
scraper = Switchboard.getSwitchboard().loader.parseResource(location.getLocationURL(), CrawlProfile.CacheStrategy.NOCACHE);
} catch (final IOException e) {
return null;
}
// analyse links in scraper resource, and find link to latest release in it
// analyze links in scraper resource, and find link to latest release in it
final Map<MultiProtocolURI, String> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
final TreeSet<yacyRelease> mainReleases = new TreeSet<yacyRelease>();
final TreeSet<yacyRelease> devReleases = new TreeSet<yacyRelease>();

@ -81,7 +81,7 @@ public class OAIListFriendsLoader {
Map<String, String> m;
for (Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
if (!oaiFriend.getValue().exists()) {
Response response = loader == null ? null : loader.load(new DigestURI(oaiFriend.getKey(), null), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey(), null), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
}

@ -48,7 +48,7 @@ public class OAIPMHLoader {
this.source = source;
// load the file from the net
Response response = loader.load(source, false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
Response response = loader.load(loader.request(source, false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(source, b);
//System.out.println("*** ResumptionToken = " + this.resumptionToken.toString());

@ -67,7 +67,7 @@ public class SetTools {
// - join by iterative tests (where we distinguish left-right and right-left tests)
public static <A, B> TreeMap<A, B> joinConstructive(final Collection<TreeMap<A, B>> maps, final boolean concatStrings) {
public final static <A, B> TreeMap<A, B> joinConstructive(final Collection<TreeMap<A, B>> maps, final boolean concatStrings) {
// this joins all TreeMap(s) contained in maps
// first order entities by their size
@ -109,7 +109,7 @@ public class SetTools {
return joinResult;
}
public static <A, B> TreeMap<A, B> joinConstructive(final TreeMap<A, B> map1, final TreeMap<A, B> map2, final boolean concatStrings) {
public final static <A, B> TreeMap<A, B> joinConstructive(final TreeMap<A, B> map1, final TreeMap<A, B> map2, final boolean concatStrings) {
// comparators must be equal
if ((map1 == null) || (map2 == null)) return null;
if (map1.comparator() != map2.comparator()) return null;
@ -130,7 +130,7 @@ public class SetTools {
}
@SuppressWarnings("unchecked")
private static <A, B> TreeMap<A, B> joinConstructiveByTest(final TreeMap<A, B> small, final TreeMap<A, B> large, final boolean concatStrings) {
private final static <A, B> TreeMap<A, B> joinConstructiveByTest(final TreeMap<A, B> small, final TreeMap<A, B> large, final boolean concatStrings) {
final Iterator<Map.Entry<A, B>> mi = small.entrySet().iterator();
final TreeMap<A, B> result = new TreeMap<A, B>(large.comparator());
Map.Entry<A, B> mentry1;
@ -150,7 +150,7 @@ public class SetTools {
}
@SuppressWarnings("unchecked")
private static <A, B> TreeMap<A, B> joinConstructiveByEnumeration(final TreeMap<A, B> map1, final TreeMap<A, B> map2, final boolean concatStrings) {
private final static <A, B> TreeMap<A, B> joinConstructiveByEnumeration(final TreeMap<A, B> map1, final TreeMap<A, B> map2, final boolean concatStrings) {
// implement pairwise enumeration
final Comparator<? super A> comp = map1.comparator();
final Iterator<Map.Entry<A, B>> mi1 = map1.entrySet().iterator();
@ -181,7 +181,7 @@ public class SetTools {
}
// now the same for set-set
public static <A> TreeSet<A> joinConstructive(final TreeSet<A> set1, final TreeSet<A> set2) {
public final static <A> TreeSet<A> joinConstructive(final TreeSet<A> set1, final TreeSet<A> set2) {
// comparators must be equal
if ((set1 == null) || (set2 == null)) return null;
if (set1.comparator() != set2.comparator()) return null;
@ -201,7 +201,7 @@ public class SetTools {
return joinConstructiveByEnumeration(set1, set2);
}
private static <A> TreeSet<A> joinConstructiveByTest(final TreeSet<A> small, final TreeSet<A> large) {
private final static <A> TreeSet<A> joinConstructiveByTest(final TreeSet<A> small, final TreeSet<A> large) {
final Iterator<A> mi = small.iterator();
final TreeSet<A> result = new TreeSet<A>(small.comparator());
A o;
@ -212,7 +212,7 @@ public class SetTools {
return result;
}
private static <A> TreeSet<A> joinConstructiveByEnumeration(final TreeSet<A> set1, final TreeSet<A> set2) {
private final static <A> TreeSet<A> joinConstructiveByEnumeration(final TreeSet<A> set1, final TreeSet<A> set2) {
// implement pairwise enumeration
final Comparator<? super A> comp = set1.comparator();
final Iterator<A> mi = set1.iterator();
@ -238,8 +238,41 @@ public class SetTools {
return result;
}
// now the same for set-set
public static <A> boolean anymatch(final TreeSet<A> set1, final TreeSet<A> set2) {
/**
* test if one set is totally included in another set
* @param <A>
* @param small
* @param large
* @return true if the small set is completely included in the large set
*/
public final static <A> boolean totalInclusion(final Set<A> small, final Set<A> large) {
for (A o: small) {
if (!large.contains(o)) return false;
}
return true;
}
/**
* test if one set is totally included in another set
* @param small
* @param large
* @return true if the small set is completely included in the large set
*/
public final static boolean totalInclusion(final HandleSet small, final HandleSet large) {
for (byte[] handle: small) {
if (!large.has(handle)) return false;
}
return true;
}
/**
* test if the intersection of two sets is not empty
* @param <A>
* @param set1
* @param set2
* @return true if any element of the first set is part of the second set or vice-versa
*/
public final static <A> boolean anymatch(final TreeSet<A> set1, final TreeSet<A> set2) {
// comparators must be equal
if ((set1 == null) || (set2 == null)) return false;
if (set1.comparator() != set2.comparator()) return false;
@ -259,7 +292,13 @@ public class SetTools {
return anymatchByEnumeration(set1, set2);
}
public static <A> boolean anymatch(final HandleSet set1, final HandleSet set2) {
/**
* test if the intersection of two sets is not empty
* @param set1
* @param set2
* @return true if any element of the first set is part of the second set or vice-versa
*/
public final static boolean anymatch(final HandleSet set1, final HandleSet set2) {
// comparators must be equal
if ((set1 == null) || (set2 == null)) return false;
if (set1.comparator() != set2.comparator()) return false;
@ -279,7 +318,7 @@ public class SetTools {
return anymatchByEnumeration(set1, set2);
}
private static <A> boolean anymatchByTest(final TreeSet<A> small, final TreeSet<A> large) {
private final static <A> boolean anymatchByTest(final TreeSet<A> small, final TreeSet<A> large) {
final Iterator<A> mi = small.iterator();
A o;
while (mi.hasNext()) {
@ -289,7 +328,7 @@ public class SetTools {
return false;
}
private static boolean anymatchByTest(final HandleSet small, final HandleSet large) {
private final static boolean anymatchByTest(final HandleSet small, final HandleSet large) {
final Iterator<byte[]> mi = small.iterator();
byte[] o;
while (mi.hasNext()) {
@ -299,7 +338,7 @@ public class SetTools {
return false;
}
private static <A> boolean anymatchByEnumeration(final TreeSet<A> set1, final TreeSet<A> set2) {
private final static <A> boolean anymatchByEnumeration(final TreeSet<A> set1, final TreeSet<A> set2) {
// implement pairwise enumeration
final Comparator<? super A> comp = set1.comparator();
final Iterator<A> mi = set1.iterator();
@ -322,7 +361,7 @@ public class SetTools {
return false;
}
private static boolean anymatchByEnumeration(final HandleSet set1, final HandleSet set2) {
private final static boolean anymatchByEnumeration(final HandleSet set1, final HandleSet set2) {
// implement pairwise enumeration
final Comparator<byte[]> comp = set1.comparator();
final Iterator<byte[]> mi = set1.iterator();
@ -370,7 +409,7 @@ public class SetTools {
}
*/
public static <A, B> void excludeDestructive(final Map<A, B> map, final Set<A> set) {
public final static <A, B> void excludeDestructive(final Map<A, B> map, final Set<A> set) {
// comparators must be equal
if (map == null) return;
if (set == null) return;
@ -383,18 +422,18 @@ public class SetTools {
excludeDestructiveByTestSetInMap(map, set);
}
private static <A, B> void excludeDestructiveByTestMapInSet(final Map<A, B> map, final Set<A> set) {
private final static <A, B> void excludeDestructiveByTestMapInSet(final Map<A, B> map, final Set<A> set) {
final Iterator<A> mi = map.keySet().iterator();
while (mi.hasNext()) if (set.contains(mi.next())) mi.remove();
}
private static <A, B> void excludeDestructiveByTestSetInMap(final Map<A, B> map, final Set<A> set) {
private final static <A, B> void excludeDestructiveByTestSetInMap(final Map<A, B> map, final Set<A> set) {
final Iterator<A> si = set.iterator();
while (si.hasNext()) map.remove(si.next());
}
// and the same again with set-set
public static <A> void excludeDestructive(final Set<A> set1, final Set<A> set2) {
public final static <A> void excludeDestructive(final Set<A> set1, final Set<A> set2) {
if (set1 == null) return;
if (set2 == null) return;
assert !(set1 instanceof TreeSet<?> && set2 instanceof TreeSet<?>) || ((TreeSet<A>) set1).comparator() == ((TreeSet<A>) set2).comparator();
@ -406,19 +445,19 @@ public class SetTools {
excludeDestructiveByTestLargeInSmall(set1, set2);
}
private static <A> void excludeDestructiveByTestSmallInLarge(final Set<A> small, final Set<A> large) {
private final static <A> void excludeDestructiveByTestSmallInLarge(final Set<A> small, final Set<A> large) {
final Iterator<A> mi = small.iterator();
while (mi.hasNext()) if (large.contains(mi.next())) mi.remove();
}
private static <A> void excludeDestructiveByTestLargeInSmall(final Set<A> large, final Set<A> small) {
private final static <A> void excludeDestructiveByTestLargeInSmall(final Set<A> large, final Set<A> small) {
final Iterator<A> si = small.iterator();
while (si.hasNext()) large.remove(si.next());
}
// ------------------------------------------------------------------------------------------------
public static TreeMap<String, String> loadMap(final String filename, final String sep) {
public final static TreeMap<String, String> loadMap(final String filename, final String sep) {
final TreeMap<String, String> map = new TreeMap<String, String>();
BufferedReader br = null;
try {
@ -437,7 +476,7 @@ public class SetTools {
return map;
}
public static TreeMap<String, ArrayList<String>> loadMapMultiValsPerKey(final String filename, final String sep) {
public final static TreeMap<String, ArrayList<String>> loadMapMultiValsPerKey(final String filename, final String sep) {
final TreeMap<String, ArrayList<String>> map = new TreeMap<String, ArrayList<String>>();
BufferedReader br = null;
try {
@ -460,7 +499,7 @@ public class SetTools {
return map;
}
public static TreeSet<String> loadList(final File file, final Comparator<String> c) {
public final static TreeSet<String> loadList(final File file, final Comparator<String> c) {
final TreeSet<String> list = new TreeSet<String>(c);
if (!(file.exists())) return list;
@ -480,7 +519,7 @@ public class SetTools {
return list;
}
public static String setToString(final HandleSet set, final char separator) {
public final static String setToString(final HandleSet set, final char separator) {
final Iterator<byte[]> i = set.iterator();
final StringBuilder sb = new StringBuilder(set.size() * 7);
if (i.hasNext()) sb.append(new String(i.next()));
@ -490,7 +529,7 @@ public class SetTools {
return sb.toString();
}
public static String setToString(final Set<String> set, final char separator) {
public final static String setToString(final Set<String> set, final char separator) {
final Iterator<String> i = set.iterator();
final StringBuilder sb = new StringBuilder(set.size() * 7);
if (i.hasNext()) sb.append(i.next());
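
Besides marking the helpers final, SetTools gains totalInclusion (is the small set completely contained in the large one?) alongside anymatch (do the sets intersect at all?), in generic-Set and HandleSet flavours; QueryParams.matches above is built on them. A tiny self-contained sketch of the generic variants:

import java.util.TreeSet;
import net.yacy.kelondro.util.SetTools;

final TreeSet<String> query = new TreeSet<String>();
query.add("yacy"); query.add("search");
final TreeSet<String> page = new TreeSet<String>();
page.add("yacy"); page.add("search"); page.add("engine");

SetTools.totalInclusion(query, page);   // true: every query word occurs in the page set
SetTools.anymatch(query, page);         // true: the sets share at least one element
SetTools.totalInclusion(page, query);   // false: "engine" is missing from the query set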

@ -29,7 +29,6 @@ package net.yacy.repository;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.net.MalformedURLException;
import java.util.Arrays;
@ -39,6 +38,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document;
import net.yacy.document.TextParser;
import net.yacy.document.ParserException;
@ -57,7 +57,6 @@ import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.crawler.retrieval.SMBLoader;
import de.anomic.http.client.Cache;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseHeader;
@ -98,38 +97,7 @@ public final class LoaderDispatcher {
public HashSet<String> getSupportedProtocols() {
return (HashSet<String>) this.supportedProtocols.clone();
}
/**
* load a resource from the web, from ftp, from smb or a file
* @param url
* @param forText shows that this was a for-text crawling request
* @param global shows that this was a global crawling request
* @param cacheStratgy strategy according to CACHE_STRATEGY_NOCACHE,CACHE_STRATEGY_IFFRESH,CACHE_STRATEGY_IFEXIST,CACHE_STRATEGY_CACHEONLY
* @return the loaded entity in a Response object
* @throws IOException
*/
public Response load(
final DigestURI url,
final boolean forText,
final boolean global,
CrawlProfile.CacheStrategy cacheStratgy,
long maxFileSize) throws IOException {
return load(request(url, forText, global), cacheStratgy, maxFileSize);
}
public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {
byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize).getContent();
if (b == null) throw new IOException("load == null");
File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
// transaction-safe writing
File parent = targetFile.getParentFile();
if (!parent.exists()) parent.mkdirs();
FileUtils.copy(b, tmp);
tmp.renameTo(targetFile);
}
/**
* generate a request object
* @param url the target url
@ -160,7 +128,27 @@ public final class LoaderDispatcher {
0,
0);
}
public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {
byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize).getContent();
if (b == null) throw new IOException("load == null");
File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
// transaction-safe writing
File parent = targetFile.getParentFile();
if (!parent.exists()) parent.mkdirs();
FileUtils.copy(b, tmp);
tmp.renameTo(targetFile);
}
/**
* load a resource from the web, from ftp, from smb or a file
* @param request the request essentials
* @param cacheStratgy strategy according to CACHE_STRATEGY_NOCACHE,CACHE_STRATEGY_IFFRESH,CACHE_STRATEGY_IFEXIST,CACHE_STRATEGY_CACHEONLY
* @return the loaded entity in a Response object
* @throws IOException
*/
public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
// get the protocol of the next URL
final String protocol = request.url().getProtocol();
@ -272,132 +260,40 @@ public final class LoaderDispatcher {
}
/**
* load the url as resource from the web or the cache
* @param url
* @param fetchOnline
* @param socketTimeout
* @param forText
* load the url as byte[] content from the web or the cache
* @param request
* @param cacheStrategy
* @param timeout
* @return the content as {@link byte[]}
* @throws IOException
*/
public byte[] getResource(final DigestURI url, CrawlProfile.CacheStrategy cacheStrategy, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
public byte[] loadContent(final Request request, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
// try to download the resource using the loader
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
final Response entry = load(url, forText, reindexing, cacheStrategy, maxFileSize);
final Response entry = load(request, cacheStrategy, maxFileSize);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
return entry.getContent();
}
/**
* Tries to load and parse a resource specified by it's URL.
* If the resource is not stored in cache and if fetchOnline is set the
* this function tries to download the resource from web.
*
* @param url the URL of the resource
* @param fetchOnline specifies if the resource should be loaded from web if it'as not available in the cache
* @param timeout
* @param forText
* @param global the domain of the search. If global == true then the content is re-indexed
* @return the parsed document as {@link Document}
*/
public static Document retrieveDocument(final DigestURI url, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, final boolean forText, final boolean global, long maxFileSize) {
public Document loadDocument(final Request request, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, long maxFileSize) throws IOException, ParserException {
// load resource
byte[] resContent = null;
ResponseHeader responseHeader = null;
try {
final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, cacheStrategy, maxFileSize);
if (entry == null) {
Log.logFine("snippet fetch", "no Response for url " + url);
return null;
}
final Response response = load(request, cacheStrategy, maxFileSize);
if (response == null) throw new IOException("no Response for url " + request.url());
// read resource body (if it is there)
resContent = entry.getContent();
// read a fresh header
responseHeader = entry.getResponseHeader();
// if it is still not available, report an error
if (resContent == null || responseHeader == null) {
Log.logFine("snippet fetch", "no Content available for url " + url);
return null;
}
} catch (final Exception e) {
Log.logFine("snippet fetch", "error loading resource: " + e.getMessage() + " for url " + url);
return null;
}
// if it is still not available, report an error
if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + request.url());
// parse resource
Document document = null;
try {
document = parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader);
} catch (final ParserException e) {
Log.logFine("snippet fetch", "parser error " + e.getMessage() + " for url " + url);
return null;
} finally {
resContent = null;
}
return document;
}
/**
* Parse the resource
* @param url the URL of the resource
* @param contentLength the contentLength of the resource
* @param resourceStream the resource body as stream
* @param docInfo metadata about the resource
* @return the extracted data
* @throws ParserException
*/
public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream, ResponseHeader responseHeader) throws ParserException {
try {
if (resourceStream == null) return null;
// STEP 1: if no resource metadata is available, try to load it from cache
if (responseHeader == null) {
// try to get the header from the htcache directory
try {
responseHeader = Cache.getResponseHeader(url);
} catch (final Exception e) {
// ignore this. resource info loading failed
}
}
// STEP 2: if the metadata is still null try to download it from web
if ((responseHeader == null) && (url.getProtocol().startsWith("http"))) {
// TODO: we need a better solution here
// e.g. encapsulate this in the crawlLoader class
// getting URL mimeType
try {
responseHeader = Client.whead(url.toString());
} catch (final Exception e) {
// ingore this. http header download failed
}
}
// STEP 3: if the metadata is still null try to guess the mimeType of the resource
String supportError = TextParser.supports(url, responseHeader == null ? null : responseHeader.mime());
if (supportError != null) {
return null;
}
if (responseHeader == null) {
return TextParser.parseSource(url, null, null, contentLength, resourceStream);
}
return TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
} catch (final InterruptedException e) {
// interruption of thread detected
return null;
}
return response.parse();
}
public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, CrawlProfile.CacheStrategy cachePolicy) throws IOException {
public ContentScraper parseResource(final DigestURI location, CrawlProfile.CacheStrategy cachePolicy) throws IOException {
// load page
final long maxFileSize = loader.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
Response r = loader.load(location, true, false, cachePolicy, maxFileSize);
final long maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
Response r = this.load(request(location, true, false), cachePolicy, maxFileSize);
byte[] page = (r == null) ? null : r.getContent();
if (page == null) throw new IOException("no response from url " + location.toString());
@ -409,6 +305,40 @@ public final class LoaderDispatcher {
return scraper;
}
/**
* load all links from a resource
* @param url the url that shall be loaded
* @param cacheStrategy the cache strategy
* @return a map from URLs to the anchor texts of the urls
* @throws IOException
*/
public final Map<MultiProtocolURI, String> loadLinks(DigestURI url, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE);
if (response == null) throw new IOException("response == null");
ResponseHeader responseHeader = response.getResponseHeader();
byte[] resource = response.getContent();
if (resource == null) throw new IOException("resource == null");
if (responseHeader == null) throw new IOException("responseHeader == null");
Document document = null;
String supportError = TextParser.supports(url, responseHeader.mime());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
document = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), resource.length, new ByteArrayInputStream(resource));
if (document == null) throw new IOException("document == null");
} catch (final ParserException e) {
throw new IOException("parser error: " + e.getMessage());
} catch (InterruptedException e) {
throw new IOException("interrupted");
} finally {
resource = null;
}
Map<MultiProtocolURI, String> result = document.getHyperlinks();
document.close();
return result;
}
public synchronized void cleanupAccessTimeTable(long timeout) {
final Iterator<Map.Entry<String, Long>> i = accessTime.entrySet().iterator();
Map.Entry<String, Long> e;
@ -439,7 +369,7 @@ public final class LoaderDispatcher {
if (this.cache.exists()) return;
try {
// load from the net
Response response = load(new DigestURI(this.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, this.maxFileSize);
Response response = load(request(new DigestURI(this.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, this.maxFileSize);
byte[] b = response.getContent();
FileUtils.copy(b, this.cache);
} catch (MalformedURLException e) {} catch (IOException e) {}
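
The new loadLinks helper wraps the load, header check, parse and close sequence for the common case of collecting a page's outgoing links. A rough usage sketch, assuming a DigestURI url and the shared sb.loader; load, parse and missing-content failures are all rethrown as IOException:

try {
    final Map<MultiProtocolURI, String> links =
            sb.loader.loadLinks(url, CrawlProfile.CacheStrategy.IFEXIST);
    for (final Map.Entry<MultiProtocolURI, String> link : links.entrySet()) {
        System.out.println(link.getKey() + " -> " + link.getValue());   // target url -> anchor text
    }
} catch (IOException e) {
    Log.logException(e);
}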
