fixed urls to media content during indexing

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8021 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 13 years ago
parent 0d858d48ec
commit 85d6bf4ac4

@ -53,7 +53,7 @@ public final class ResultURLs {
// 4) proxy-load (initiator is "------------")
// 5) local prefetch/crawling (initiator is own seedHash)
// 6) local fetching for global crawling (other known or unknown initiator)
UNKNOWN(0),
REMOTE_RECEIPTS(1),
QUERIES(2),
@ -62,31 +62,31 @@ public final class ResultURLs {
LOCAL_CRAWLING(5),
GLOBAL_CRAWLING(6),
SURROGATES(7);
protected int code;
private static final EventOrigin[] list = {
UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING, SURROGATES};
private EventOrigin(int code) {
private EventOrigin(final int code) {
this.code = code;
}
/**
 * Returns the integer code stored in this constant's {@code code} field.
 *
 * @return the value of {@code this.code}
 */
public int getCode() {
return this.code;
}
public static final EventOrigin getEvent(int key) {
public static final EventOrigin getEvent(final int key) {
return list[key];
}
}
private final static Map<EventOrigin, Map<String, InitExecEntry>> resultStacks = new ConcurrentHashMap<EventOrigin, Map<String, InitExecEntry>>(); // a mapping from urlHash to Entries
private final static Map<EventOrigin, ScoreMap<String>> resultDomains = new ConcurrentHashMap<EventOrigin, ScoreMap<String>>();
static {
for (EventOrigin origin: EventOrigin.values()) {
for (final EventOrigin origin: EventOrigin.values()) {
resultStacks.put(origin, new LinkedHashMap<String, InitExecEntry>());
resultDomains.put(origin, new ClusteredScoreMap<String>());
}
}
public static class InitExecEntry {
public byte[] initiatorHash, executorHash;
public InitExecEntry(final byte[] initiatorHash, final byte[] executorHash) {
@ -122,19 +122,19 @@ public final class ResultURLs {
return;
}
}
/**
 * Reports how many entries the result stack of the given origin holds.
 *
 * @param stack the event origin whose result stack is measured
 * @return the number of entries in that stack, or 0 when {@code getStack}
 *         yields no stack for the origin
 */
public static int getStackSize(final EventOrigin stack) {
    final Map<String, InitExecEntry> entries = getStack(stack);
    return (entries == null) ? 0 : entries.size();
}
/**
 * Reports how many domains are tracked for the given origin.
 *
 * @param stack the event origin whose domain score map is measured
 * @return the size of that score map, or 0 when {@code getDomains}
 *         yields no map for the origin
 */
public static int getDomainListSize(final EventOrigin stack) {
    final ScoreMap<String> scores = getDomains(stack);
    return (scores == null) ? 0 : scores.size();
}
public static Iterator<Map.Entry<String, InitExecEntry>> results(final EventOrigin stack) {
final Map<String, InitExecEntry> resultStack = getStack(stack);
if (resultStack == null) return new LinkedHashMap<String, InitExecEntry>().entrySet().iterator();
@ -149,8 +149,8 @@ public final class ResultURLs {
assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).keys(false);
}
public static int deleteDomain(final EventOrigin stack, String host, String hosthash) {
public static int deleteDomain(final EventOrigin stack, final String host, final String hosthash) {
assert host != null : "host = null";
assert hosthash.length() == 6;
final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack);
@ -164,22 +164,22 @@ public final class ResultURLs {
assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).delete(host);
}
/**
* Returns how often the given domain occurs in the statistics of a result stack.
* @param stack type
* @param domain name
* @return the number of occurrences of the domain in the stack statistics
*/
public static int domainCount(final EventOrigin stack, String domain) {
public static int domainCount(final EventOrigin stack, final String domain) {
assert domain != null : "domain = null";
assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).get(domain);
}
/**
* returns the stack identified by the id <em>stack</em>
*
*
* @param stack id of resultStack
* @return null if stack does not exist (id is unknown or stack is null (which should not occur and an error is logged))
*/
@ -191,9 +191,9 @@ public final class ResultURLs {
}
public static void clearStacks() {
for (EventOrigin origin: EventOrigin.values()) clearStack(origin);
for (final EventOrigin origin: EventOrigin.values()) clearStack(origin);
}
public static void clearStack(final EventOrigin stack) {
final Map<String, InitExecEntry> resultStack = getStack(stack);
if (resultStack != null) resultStack.clear();
@ -208,13 +208,13 @@ public final class ResultURLs {
public static boolean remove(final String urlHash) {
if (urlHash == null) return false;
Map<String, InitExecEntry> resultStack;
for (EventOrigin origin: EventOrigin.values()) {
for (final EventOrigin origin: EventOrigin.values()) {
resultStack = getStack(origin);
if (resultStack != null) resultStack.remove(urlHash);
}
return true;
}
/**
* test and benchmark
* @param args
@ -223,7 +223,7 @@ public final class ResultURLs {
try {
final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/");
final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0f, 0.0f, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0);
EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING;
final EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING;
System.out.println("valid test:\n=======");
// add
stack(urlRef, urlRef.hash(), url.hash(), stackNo);

@ -456,23 +456,23 @@ public final class TextParser {
final ArrayList<Document> docs = new ArrayList<Document>();
docs.add(document);
for (final Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
docs.add(genLinkDocs(docs, "application", link.getKey(), link.getValue(), document.getContentLanguages()));
docs.add(genLinkDocs("application", link.getKey(), link.getValue(), document.getContentLanguages()));
}
for (final Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
docs.add(genLinkDocs(docs, "audio", link.getKey(), link.getValue(), document.getContentLanguages()));
docs.add(genLinkDocs("audio", link.getKey(), link.getValue(), document.getContentLanguages()));
}
for (final Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
docs.add(genLinkDocs(docs, "video", link.getKey(), link.getValue(), document.getContentLanguages()));
docs.add(genLinkDocs("video", link.getKey(), link.getValue(), document.getContentLanguages()));
}
for (final Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) {
docs.add(genImageDocs(docs, link.getValue()));
docs.add(genImageDocs(link.getValue()));
}
// finally return the list of documents
return docs.toArray(new Document[docs.size()]);
}
private final static Document genLinkDocs(final ArrayList<Document> docs, final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
private final static Document genLinkDocs(final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
//System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
return new Document(
uri,
@ -494,7 +494,7 @@ public final class TextParser {
false);
}
private final static Document genImageDocs(final ArrayList<Document> docs, final ImageEntry img) {
private final static Document genImageDocs(final ImageEntry img) {
//System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
return new Document(
img.url(),

@ -2087,8 +2087,11 @@ public final class Switchboard extends serverSwitch {
private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser, final SearchEvent searchEvent, final String sourceName) {
//TODO: document must carry referer, size and last modified
// CREATE INDEX
final String dc_title = document.dc_title();
final DigestURI url = new DigestURI(document.dc_source());
final DigestURI referrerURL = queueEntry.referrerURL();
EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash);
if (process == Segments.Process.SURROGATES) {
@ -2097,24 +2100,24 @@ public final class Switchboard extends serverSwitch {
if (condenser == null || document.indexingDenied()) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase);
addURLtoErrorDB(url, (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase);
return;
}
if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
addURLtoErrorDB(url, (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
return;
}
// remove stopwords
this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());
this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url);
// STORE WORD INDEX
URIMetadataRow newEntry = null;
try {
newEntry = this.indexSegments.segment(process).storeDocument(
queueEntry.url(),
url,
referrerURL,
queueEntry.lastModified(),
new Date(),
@ -2127,14 +2130,14 @@ public final class Switchboard extends serverSwitch {
feed.addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
} catch (final IOException e) {
//if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "error storing url: " + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase + ", error = " + e.getMessage());
addURLtoErrorDB(url, (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "error storing url: " + url.toNormalform(false, true) + "': process case=" + processCase + ", error = " + e.getMessage());
return;
}
// store rss feeds in document into rss table
for (final Map.Entry<MultiProtocolURI, String> rssEntry : document.getRSS().entrySet()) {
final Tables.Data rssRow = new Tables.Data();
rssRow.put("referrer", queueEntry.url().hash());
rssRow.put("referrer", url.hash());
rssRow.put("url", UTF8.getBytes(rssEntry.getKey().toNormalform(true, false)));
rssRow.put("title", UTF8.getBytes(rssEntry.getValue()));
rssRow.put("recording_date", new Date());
@ -2163,7 +2166,7 @@ public final class Switchboard extends serverSwitch {
EventTracker.update(EventTracker.EClass.PPM, Long.valueOf(currentPPM()), true);
lastPPMUpdate = System.currentTimeMillis();
}
EventTracker.update(EventTracker.EClass.INDEX, queueEntry.url().toNormalform(true, false), false);
EventTracker.update(EventTracker.EClass.INDEX, url.toNormalform(true, false), false);
// if this was performed for a remote crawl request, notify requester
if ((processCase == EventOrigin.GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) {

Loading…
Cancel
Save