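Fixed URLs of media content during indexing

Switchboard.storeDocumentIndex now derives the URL from the document itself
(document.dc_source()) instead of queueEntry.url() when storing the document,
recording errors, logging, writing RSS referrers and tracking index events.
TextParser.genLinkDocs/genImageDocs drop their unused 'docs' parameter, and
ResultURLs gains 'final' modifiers on parameters and locals.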

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8021 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 0d858d48ec
commit 85d6bf4ac4

@ -66,13 +66,13 @@ public final class ResultURLs {
protected int code; protected int code;
private static final EventOrigin[] list = { private static final EventOrigin[] list = {
UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING, SURROGATES}; UNKNOWN, REMOTE_RECEIPTS, QUERIES, DHT_TRANSFER, PROXY_LOAD, LOCAL_CRAWLING, GLOBAL_CRAWLING, SURROGATES};
private EventOrigin(int code) { private EventOrigin(final int code) {
this.code = code; this.code = code;
} }
public int getCode() { public int getCode() {
return this.code; return this.code;
} }
public static final EventOrigin getEvent(int key) { public static final EventOrigin getEvent(final int key) {
return list[key]; return list[key];
} }
} }
@ -81,7 +81,7 @@ public final class ResultURLs {
private final static Map<EventOrigin, ScoreMap<String>> resultDomains = new ConcurrentHashMap<EventOrigin, ScoreMap<String>>(); private final static Map<EventOrigin, ScoreMap<String>> resultDomains = new ConcurrentHashMap<EventOrigin, ScoreMap<String>>();
static { static {
for (EventOrigin origin: EventOrigin.values()) { for (final EventOrigin origin: EventOrigin.values()) {
resultStacks.put(origin, new LinkedHashMap<String, InitExecEntry>()); resultStacks.put(origin, new LinkedHashMap<String, InitExecEntry>());
resultDomains.put(origin, new ClusteredScoreMap<String>()); resultDomains.put(origin, new ClusteredScoreMap<String>());
} }
@ -150,7 +150,7 @@ public final class ResultURLs {
return getDomains(stack).keys(false); return getDomains(stack).keys(false);
} }
public static int deleteDomain(final EventOrigin stack, String host, String hosthash) { public static int deleteDomain(final EventOrigin stack, final String host, final String hosthash) {
assert host != null : "host = null"; assert host != null : "host = null";
assert hosthash.length() == 6; assert hosthash.length() == 6;
final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack); final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack);
@ -171,7 +171,7 @@ public final class ResultURLs {
* @param domain name * @param domain name
* @return the number of occurrences of the domain in the stack statistics * @return the number of occurrences of the domain in the stack statistics
*/ */
public static int domainCount(final EventOrigin stack, String domain) { public static int domainCount(final EventOrigin stack, final String domain) {
assert domain != null : "domain = null"; assert domain != null : "domain = null";
assert getDomains(stack) != null : "getDomains(" + stack + ") = null"; assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
return getDomains(stack).get(domain); return getDomains(stack).get(domain);
@ -191,7 +191,7 @@ public final class ResultURLs {
} }
public static void clearStacks() { public static void clearStacks() {
for (EventOrigin origin: EventOrigin.values()) clearStack(origin); for (final EventOrigin origin: EventOrigin.values()) clearStack(origin);
} }
public static void clearStack(final EventOrigin stack) { public static void clearStack(final EventOrigin stack) {
@ -208,7 +208,7 @@ public final class ResultURLs {
public static boolean remove(final String urlHash) { public static boolean remove(final String urlHash) {
if (urlHash == null) return false; if (urlHash == null) return false;
Map<String, InitExecEntry> resultStack; Map<String, InitExecEntry> resultStack;
for (EventOrigin origin: EventOrigin.values()) { for (final EventOrigin origin: EventOrigin.values()) {
resultStack = getStack(origin); resultStack = getStack(origin);
if (resultStack != null) resultStack.remove(urlHash); if (resultStack != null) resultStack.remove(urlHash);
} }
@ -223,7 +223,7 @@ public final class ResultURLs {
try { try {
final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/"); final DigestURI url = new DigestURI("http", "www.yacy.net", 80, "/");
final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0f, 0.0f, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0); final URIMetadataRow urlRef = new URIMetadataRow(url, "YaCy Homepage", "", "", "", 0.0f, 0.0f, new Date(), new Date(), new Date(), "", new byte[] {}, 123, 42, '?', new Bitfield(), UTF8.getBytes("de"), 0, 0, 0, 0, 0, 0);
EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING; final EventOrigin stackNo = EventOrigin.LOCAL_CRAWLING;
System.out.println("valid test:\n======="); System.out.println("valid test:\n=======");
// add // add
stack(urlRef, urlRef.hash(), url.hash(), stackNo); stack(urlRef, urlRef.hash(), url.hash(), stackNo);

@ -456,23 +456,23 @@ public final class TextParser {
final ArrayList<Document> docs = new ArrayList<Document>(); final ArrayList<Document> docs = new ArrayList<Document>();
docs.add(document); docs.add(document);
for (final Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) { for (final Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
docs.add(genLinkDocs(docs, "application", link.getKey(), link.getValue(), document.getContentLanguages())); docs.add(genLinkDocs("application", link.getKey(), link.getValue(), document.getContentLanguages()));
} }
for (final Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) { for (final Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
docs.add(genLinkDocs(docs, "audio", link.getKey(), link.getValue(), document.getContentLanguages())); docs.add(genLinkDocs("audio", link.getKey(), link.getValue(), document.getContentLanguages()));
} }
for (final Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) { for (final Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
docs.add(genLinkDocs(docs, "video", link.getKey(), link.getValue(), document.getContentLanguages())); docs.add(genLinkDocs("video", link.getKey(), link.getValue(), document.getContentLanguages()));
} }
for (final Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) { for (final Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) {
docs.add(genImageDocs(docs, link.getValue())); docs.add(genImageDocs(link.getValue()));
} }
// finally return the list of documents // finally return the list of documents
return docs.toArray(new Document[docs.size()]); return docs.toArray(new Document[docs.size()]);
} }
private final static Document genLinkDocs(final ArrayList<Document> docs, final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) { private final static Document genLinkDocs(final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
//System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr); //System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
return new Document( return new Document(
uri, uri,
@ -494,7 +494,7 @@ public final class TextParser {
false); false);
} }
private final static Document genImageDocs(final ArrayList<Document> docs, final ImageEntry img) { private final static Document genImageDocs(final ImageEntry img) {
//System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt()); //System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
return new Document( return new Document(
img.url(), img.url(),

@ -2087,8 +2087,11 @@ public final class Switchboard extends serverSwitch {
private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser, final SearchEvent searchEvent, final String sourceName) { private void storeDocumentIndex(final Segments.Process process, final Response queueEntry, final Document document, final Condenser condenser, final SearchEvent searchEvent, final String sourceName) {
//TODO: document must carry referer, size and last modified
// CREATE INDEX // CREATE INDEX
final String dc_title = document.dc_title(); final String dc_title = document.dc_title();
final DigestURI url = new DigestURI(document.dc_source());
final DigestURI referrerURL = queueEntry.referrerURL(); final DigestURI referrerURL = queueEntry.referrerURL();
EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash); EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash);
if (process == Segments.Process.SURROGATES) { if (process == Segments.Process.SURROGATES) {
@ -2097,24 +2100,24 @@ public final class Switchboard extends serverSwitch {
if (condenser == null || document.indexingDenied()) { if (condenser == null || document.indexingDenied()) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase); //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase); addURLtoErrorDB(url, (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase);
return; return;
} }
if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) { if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name()); //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name()); addURLtoErrorDB(url, (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
return; return;
} }
// remove stopwords // remove stopwords
this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url()); this.log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url);
// STORE WORD INDEX // STORE WORD INDEX
URIMetadataRow newEntry = null; URIMetadataRow newEntry = null;
try { try {
newEntry = this.indexSegments.segment(process).storeDocument( newEntry = this.indexSegments.segment(process).storeDocument(
queueEntry.url(), url,
referrerURL, referrerURL,
queueEntry.lastModified(), queueEntry.lastModified(),
new Date(), new Date(),
@ -2127,14 +2130,14 @@ public final class Switchboard extends serverSwitch {
feed.addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false))); feed.addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
} catch (final IOException e) { } catch (final IOException e) {
//if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase); //if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "error storing url: " + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase + ", error = " + e.getMessage()); addURLtoErrorDB(url, (referrerURL == null) ? null : referrerURL.hash(), queueEntry.initiator(), dc_title, FailCategory.FINAL_LOAD_CONTEXT, "error storing url: " + url.toNormalform(false, true) + "': process case=" + processCase + ", error = " + e.getMessage());
return; return;
} }
// store rss feeds in document into rss table // store rss feeds in document into rss table
for (final Map.Entry<MultiProtocolURI, String> rssEntry : document.getRSS().entrySet()) { for (final Map.Entry<MultiProtocolURI, String> rssEntry : document.getRSS().entrySet()) {
final Tables.Data rssRow = new Tables.Data(); final Tables.Data rssRow = new Tables.Data();
rssRow.put("referrer", queueEntry.url().hash()); rssRow.put("referrer", url.hash());
rssRow.put("url", UTF8.getBytes(rssEntry.getKey().toNormalform(true, false))); rssRow.put("url", UTF8.getBytes(rssEntry.getKey().toNormalform(true, false)));
rssRow.put("title", UTF8.getBytes(rssEntry.getValue())); rssRow.put("title", UTF8.getBytes(rssEntry.getValue()));
rssRow.put("recording_date", new Date()); rssRow.put("recording_date", new Date());
@ -2163,7 +2166,7 @@ public final class Switchboard extends serverSwitch {
EventTracker.update(EventTracker.EClass.PPM, Long.valueOf(currentPPM()), true); EventTracker.update(EventTracker.EClass.PPM, Long.valueOf(currentPPM()), true);
lastPPMUpdate = System.currentTimeMillis(); lastPPMUpdate = System.currentTimeMillis();
} }
EventTracker.update(EventTracker.EClass.INDEX, queueEntry.url().toNormalform(true, false), false); EventTracker.update(EventTracker.EClass.INDEX, url.toNormalform(true, false), false);
// if this was performed for a remote crawl request, notify requester // if this was performed for a remote crawl request, notify requester
if ((processCase == EventOrigin.GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) { if ((processCase == EventOrigin.GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) {

Loading…
Cancel
Save