fixed urls to media content during indexing

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8021 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 13 years ago
parent 0d858d48ec
commit 85d6bf4ac4

@ -53,7 +53,7 @@ public final class ResultURLs {
// 4) proxy-load (initiator is "------------") // 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash) // 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknown initiator) // 6) local fetching for global crawling (other known or unknown initiator)
UNKNOWN(0), UNKNOWN(0),
REMOTE_RECEIPTS(1), REMOTE_RECEIPTS(1),
QUERIES(2), QUERIES(2),
@ -62,31 +62,31 @@ public final class ResultURLs {
LOCAL_CRAWLING(5), LOCAL_CRAWLING(5),
GLOBAL_CRAWLING(6), GLOBAL_CRAWLING(6),
SURROGATES(7); SURROGATES(7);
protected int code; protected int code;
private static final EventOrigin[] list = { private static final EventOrigin[] list = {
UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING, SURROGATES}; UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING, SURROGATES};
private EventOrigin(int code) { private EventOrigin(final int code) {
this.code = code; this.code = code;
} }
public int getCode() { public int getCode() {
return this.code; return this.code;
} }
public static final EventOrigin getEvent(int key) { public static final EventOrigin getEvent(final int key) {
return list[key]; return list[key];
} }
} }
private final static Map<EventOrigin, Map<String, InitExecEntry>> resultStacks = new ConcurrentHashMap<EventOrigin, Map<String, InitExecEntry>>(); // a mapping from urlHash to Entries private final static Map<EventOrigin, Map<String, InitExecEntry>> resultStacks = new ConcurrentHashMap<EventOrigin, Map<String, InitExecEntry>>(); // a mapping from urlHash to Entries
private final static Map<EventOrigin, ScoreMap<String>> resultDomains = new ConcurrentHashMap<EventOrigin, ScoreMap<String>>(); private final static Map<EventOrigin, ScoreMap<String>> resultDomains = new ConcurrentHashMap<EventOrigin, ScoreMap<String>>();
static { static {
for (EventOrigin origin: EventOrigin.values()) { for (final EventOrigin origin: EventOrigin.values()) {
resultStacks.put(origin, new LinkedHashMap<String, InitExecEntry>()); resultStacks.put(origin, new LinkedHashMap<String, InitExecEntry>());
resultDomains.put(origin, new ClusteredScoreMap<String>()); resultDomains.put(origin, new ClusteredScoreMap<String>());
} }
} }
public static class InitExecEntry { public static class InitExecEntry {
public byte[] initiatorHash, executorHash; public byte[] initiatorHash, executorHash;
public InitExecEntry(final byte[] initiatorHash, final byte[] executorHash) { public InitExecEntry(final byte[] initiatorHash, final byte[] executorHash) {
@ -122,19 +122,19 @@ public final class ResultURLs {
return; return;
} }
} }
public static int getStackSize(final EventOrigin stack) { public static int getStackSize(final EventOrigin stack) {
final Map<String, InitExecEntry> resultStack = getStack(stack); final Map<String, InitExecEntry> resultStack = getStack(stack);
if (resultStack == null) return 0; if (resultStack == null) return 0;
return resultStack.size(); return resultStack.size();
} }
public static int getDomainListSize(final EventOrigin stack) { public static int getDomainListSize(final EventOrigin stack) {
final ScoreMap<String> domains = getDomains(stack); final ScoreMap<String> domains = getDomains(stack);
if (domains == null) return 0; if (domains == null) return 0;
return domains.size(); return domains.size();
} }
public static Iterator<Map.Entry<String, InitExecEntry>> results(final EventOrigin stack) { public static Iterator<Map.Entry<String, InitExecEntry>> results(final EventOrigin stack) {
final Map<String, InitExecEntry> resultStack = getStack(stack); final Map<String, InitExecEntry> resultStack = getStack(stack);
if (resultStack == null) return new LinkedHashMap<String, InitExecEntry>().entrySet().iterator(); if (resultStack == null) return new LinkedHashMap<String, InitExecEntry>().entrySet().iterator();
@ -149,8 +149,8 @@ public final class ResultURLs {
assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).keys(false); return getDomains(stack).keys(false);
} }
public static int deleteDomain(final EventOrigin stack, String host, String hosthash) { public static int deleteDomain(final EventOrigin stack, final String host, final String hosthash) {
assert host != null : "host = null"; assert host != null : "host = null";
assert hosthash.length() == 6; assert hosthash.length() == 6;
final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack); final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack);
@ -164,22 +164,22 @@ public final class ResultURLs {
assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).delete(host); return getDomains(stack).delete(host);
} }
/** /**
* return the count of the domain * return the count of the domain
* @param stack type * @param stack type
* @param domain name * @param domain name
* @return the number of occurrences of the domain in the stack statistics * @return the number of occurrences of the domain in the stack statistics
*/ */
public static int domainCount(final EventOrigin stack, String domain) { public static int domainCount(final EventOrigin stack, final String domain) {
assert domain != null : "domain = null"; assert domain != null : "domain = null";
assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).get(domain); return getDomains(stack).get(domain);
} }
/** /**
* returns the stack identified by the id <em>stack</em> * returns the stack identified by the id <em>stack</em>
* *
* @param stack id of resultStack * @param stack id of resultStack
* @return null if stack does not exist (id is unknown or stack is null (which should not occur and an error is logged)) * @return null if stack does not exist (id is unknown or stack is null (which should not occur and an error is logged))
*/ */
@ -191,9 +191,9 @@ public final class ResultURLs {
} }
public static void clearStacks() { public static void clearStacks() {
for (EventOrigin origin: EventOrigin.values()) clearStack(origin); for (final EventOrigin origin: EventOrigin.values()) clearStack(origin);
} }
public static void clearStack(final EventOrigin stack) { public static void clearStack(final EventOrigin stack) {
final Map<String, InitExecEntry> resultStack = getStack(stack); final Map<String, InitExecEntry> resultStack = getStack(stack);
if (resultStack != null) resultStack.clear(); if (resultStack != null) resultStack.clear();
@ -208,13 +208,13 @@ public final class ResultURLs {
public static boolean remove(final String urlHash) { public static boolean remove(final String urlHash) {
if (urlHash == null) return false; if (urlHash == null) return false;
Map<String, InitExecEntry> resultStack; Map<String, InitExecEntry> resultStack;
for (EventOrigin origin: EventOrigin.values()) { for (final EventOrigin origin: EventOrigin.values()) {
resultStack = getStack(origin); resultStack = getStack(origin);
if (resultStack != null) resultStack.remove(urlHash); if (resultStack != null) resultStack.remove(urlHash);
} }
return true; return true;
} }
/** /**
* test and benchmark * test and benchmark
* @param args * @param args
@ -223,7 +223,7 @@ public final class ResultURLs {
try { try {
final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/"); final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/");
final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0f, 0.0f, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0); final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0f, 0.0f, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0);
EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING; final EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING;
System.out.println("valid test:\n======="); System.out.println("valid test:\n=======");
// add // add
stack(urlRef, urlRef.hash(), url.hash(), stackNo); stack(urlRef, urlRef.hash(), url.hash(), stackNo);

@ -456,23 +456,23 @@ public final class TextParser {
final ArrayList<Document> docs = new ArrayList<Document>(); final ArrayList<Document> docs = new ArrayList<Document>();
docs.add(document); docs.add(document);
for (final Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) { for (final Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
docs.add(genLinkDocs(docs, "application", link.getKey(), link.getValue(), document.getContentLanguages())); docs.add(genLinkDocs("application", link.getKey(), link.getValue(), document.getContentLanguages()));
} }
for (final Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) { for (final Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
docs.add(genLinkDocs(docs, "audio", link.getKey(), link.getValue(), document.getContentLanguages())); docs.add(genLinkDocs("audio", link.getKey(), link.getValue(), document.getContentLanguages()));
} }
for (final Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) { for (final Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
docs.add(genLinkDocs(docs, "video", link.getKey(), link.getValue(), document.getContentLanguages())); docs.add(genLinkDocs("video", link.getKey(), link.getValue(), document.getContentLanguages()));
} }
for (final Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) { for (final Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) {
docs.add(genImageDocs(docs, link.getValue())); docs.add(genImageDocs(link.getValue()));
} }
// finally return the list of documents // finally return the list of documents
return docs.toArray(new Document[docs.size()]); return docs.toArray(new Document[docs.size()]);
} }
private final static Document genLinkDocs(final ArrayList<Document> docs, final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) { private final static Document genLinkDocs(final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
//System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr); //System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
return new Document( return new Document(
uri, uri,
@ -494,7 +494,7 @@ public final class TextParser {
false); false);
} }
private final static Document genImageDocs(final ArrayList<Document> docs, final ImageEntry img) { private final static Document genImageDocs(final ImageEntry img) {
//System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt()); //System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
return new Document( return new Document(
img.url(), img.url(),

@ -2087,8 +2087,11 @@ public final class Switchboard extends serverSwitch {
private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser, final SearchEvent searchEvent, final String sourceName) { private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser, final SearchEvent searchEvent, final String sourceName) {
//TODO: document must carry referer, size and last modified
// CREATE INDEX // CREATE INDEX
final String dc_title = document.dc_title(); final String dc_title = document.dc_title();
final DigestURI url = new DigestURI(document.dc_source());
final DigestURI referrerURL = queueEntry.referrerURL(); final DigestURI referrerURL = queueEntry.referrerURL();
EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash); EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash);
if (process == Segments.Process.SURROGATES) { if (process == Segments.Process.SURROGATES) {
@ -2097,24 +2100,24 @@ public final class Switchboard extends serverSwitch {
if (condenser == null || document.indexingDenied()) { if (condenser == null || document.indexingDenied()) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase); //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase); addURLtoErrorDB(url, (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase);
return; return;
} }
if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) { if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name()); //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name()); addURLtoErrorDB(url, (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
return; return;
} }
// remove stopwords // remove stopwords
this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url()); this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url);
// STORE WORD INDEX // STORE WORD INDEX
URIMetadataRow newEntry = null; URIMetadataRow newEntry = null;
try { try {
newEntry = this.indexSegments.segment(process).storeDocument( newEntry = this.indexSegments.segment(process).storeDocument(
queueEntry.url(), url,
referrerURL, referrerURL,
queueEntry.lastModified(), queueEntry.lastModified(),
new Date(), new Date(),
@ -2127,14 +2130,14 @@ public final class Switchboard extends serverSwitch {
feed.addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false))); feed.addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
} catch (final IOException e) { } catch (final IOException e) {
//if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase); //if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "error storing url: " + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase + ", error = " + e.getMessage()); addURLtoErrorDB(url, (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "error storing url: " + url.toNormalform(false, true) + "': process case=" + processCase + ", error = " + e.getMessage());
return; return;
} }
// store rss feeds in document into rss table // store rss feeds in document into rss table
for (final Map.Entry<MultiProtocolURI, String> rssEntry : document.getRSS().entrySet()) { for (final Map.Entry<MultiProtocolURI, String> rssEntry : document.getRSS().entrySet()) {
final Tables.Data rssRow = new Tables.Data(); final Tables.Data rssRow = new Tables.Data();
rssRow.put("referrer", queueEntry.url().hash()); rssRow.put("referrer", url.hash());
rssRow.put("url", UTF8.getBytes(rssEntry.getKey().toNormalform(true, false))); rssRow.put("url", UTF8.getBytes(rssEntry.getKey().toNormalform(true, false)));
rssRow.put("title", UTF8.getBytes(rssEntry.getValue())); rssRow.put("title", UTF8.getBytes(rssEntry.getValue()));
rssRow.put("recording_date", new Date()); rssRow.put("recording_date", new Date());
@ -2163,7 +2166,7 @@ public final class Switchboard extends serverSwitch {
EventTracker.update(EventTracker.EClass.PPM, Long.valueOf(currentPPM()), true); EventTracker.update(EventTracker.EClass.PPM, Long.valueOf(currentPPM()), true);
lastPPMUpdate = System.currentTimeMillis(); lastPPMUpdate = System.currentTimeMillis();
} }
EventTracker.update(EventTracker.EClass.INDEX, queueEntry.url().toNormalform(true, false), false); EventTracker.update(EventTracker.EClass.INDEX, url.toNormalform(true, false), false);
// if this was performed for a remote crawl request, notify requester // if this was performed for a remote crawl request, notify requester
if ((processCase == EventOrigin.GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) { if ((processCase == EventOrigin.GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) {

Loading…
Cancel
Save