next refactoring step in document indexing to prepare concurrency environment for document parsing

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4604 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 7f9f639d20
commit 9b0e20fb06

@ -54,7 +54,7 @@ import de.anomic.yacy.yacyURL;
public final class indexRepositoryReference { public final class indexRepositoryReference {
// class objects // class objects
private kelondroIndex urlIndexFile; kelondroIndex urlIndexFile;
private Export exportthread = null; // will habe a export thread assigned if exporter is running private Export exportthread = null; // will habe a export thread assigned if exporter is running
public indexRepositoryReference(File indexSecondaryRoot, String networkName) { public indexRepositoryReference(File indexSecondaryRoot, String networkName) {

@ -54,6 +54,7 @@ import de.anomic.server.serverFileUtils;
import de.anomic.yacy.yacyURL; import de.anomic.yacy.yacyURL;
import java.util.Arrays; import java.util.Arrays;
import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
@ -86,6 +87,7 @@ public class plasmaParserDocument {
private yacyURL favicon; private yacyURL favicon;
private boolean resorted; private boolean resorted;
private InputStream textStream; private InputStream textStream;
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
protected plasmaParserDocument(yacyURL location, String mimeType, String charset, protected plasmaParserDocument(yacyURL location, String mimeType, String charset,
String[] keywords, String title, String author, String[] keywords, String title, String author,
@ -107,6 +109,8 @@ public class plasmaParserDocument {
this.applinks = null; this.applinks = null;
this.emaillinks = null; this.emaillinks = null;
this.resorted = false; this.resorted = false;
this.inboundLinks = -1;
this.outboundLinks = -1;
if (text == null) try { if (text == null) try {
this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE); this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
@ -430,6 +434,22 @@ dc_rights
this.favicon = faviconURL; this.favicon = faviconURL;
} }
/**
 * Registers this document with the web structure and remembers the link counts
 * that the web structure reports back.
 *
 * The citation reference is generated by
 * {@code webStructure.generateCitationReference(this, condenser, docDate)}, which
 * returns a two-element array {@code [outlinksSame, outlinksOther]}. The first
 * element becomes the inbound link counter and the second the outbound link
 * counter, which are read later through {@link #inboundLinks()} and
 * {@link #outboundLinks()}.
 *
 * @param webStructure the web structure that generates the citation reference
 * @param condenser    the condenser that was computed for this document
 * @param docDate      the document date handed on to the citation reference
 */
public void notifyWebStructure(plasmaWebStructure webStructure, plasmaCondenser condenser, Date docDate) {
    // result layout: [outlinksSame, outlinksOther]
    final Integer[] linkCounts = webStructure.generateCitationReference(this, condenser, docDate);
    // auto-unboxing; the inbound counter is written before the second element is touched
    this.inboundLinks = linkCounts[0];
    this.outboundLinks = linkCounts[1];
}
/**
 * Returns the inbound link counter of this document.
 *
 * The counter is initialised to -1 in the constructor and is filled in by
 * {@code notifyWebStructure}. With assertions enabled, reading it while it is
 * still negative fails the assertion; without assertions a negative counter is
 * reported as 0.
 *
 * @return the inbound link count, never negative
 */
public int inboundLinks() {
    assert this.inboundLinks >= 0;
    return Math.max(0, this.inboundLinks);
}
/**
 * Returns the outbound link counter of this document.
 *
 * The counter is initialised to -1 in the constructor and is filled in by
 * {@code notifyWebStructure}. With assertions enabled, reading it while it is
 * still negative fails the assertion; without assertions a negative counter is
 * reported as 0.
 *
 * @return the outbound link count, never negative
 */
public int outboundLinks() {
    assert this.outboundLinks >= 0;
    return Math.max(0, this.outboundLinks);
}
public void close() { public void close() {
// try close the output stream // try close the output stream
if (this.textStream != null) { if (this.textStream != null) {

@ -1862,7 +1862,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
if (document != null) { if (document != null) {
plasmaCondenser condensement = condenseDocument(nextentry, document); plasmaCondenser condensement = condenseDocument(nextentry, document);
if (condensement != null) { if (condensement != null) {
indexDocument(nextentry, document, condensement); document.notifyWebStructure(webStructure, condensement, nextentry.getModificationDate());
storeDocumentIndex(nextentry, document, condensement);
} }
} }
return true; return true;
@ -2228,8 +2229,58 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
return condenser; return condenser;
} }
/**
 * Stores a parsed and condensed document in the index and does the bookkeeping
 * that follows a store attempt.
 *
 * Steps, in order:
 * 1. the stopwords are excluded from the condenser and the number of excluded words is logged;
 * 2. the document is handed to {@code wordIndex.storeDocument}; if that throws an
 *    {@link IOException} the URL is recorded in the error DB and the method returns;
 * 3. the new loaded-URL entry is pushed onto the crawl result stack;
 * 4. if the crawl profile asks for neither text nor media indexing, the URL is
 *    recorded in the error DB and the method returns;
 * 5. the indexed-page counter and the profiling info are updated;
 * 6. for a global crawl with a known initiator peer, a crawl receipt is sent to that peer.
 *
 * @param entry     the switchboard queue entry that is being processed
 * @param document  the parsed document
 * @param condenser the condenser that was computed for the document
 */
private void storeDocumentIndex(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) {

    // CREATE INDEX
    String dc_title = document.dc_title();
    yacyURL referrerURL = entry.referrerURL();
    // An entry may have no referrer; plasmaWordIndex.storeDocument treats a null
    // referrer as valid, so the error paths below must not dereference it blindly.
    // NOTE(review): null is handed to addURLtoErrorDB in that case - confirm that it accepts a null referrer hash.
    String referrerHash = (referrerURL == null) ? null : referrerURL.hash();
    int processCase = entry.processCase();

    // remove stopwords
    log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url());

    // STORE URL TO LOADED-URL-DB
    indexURLReference newEntry = null;
    try {
        newEntry = wordIndex.storeDocument(entry, document, condenser);
    } catch (IOException e) {
        log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
        addURLtoErrorDB(entry.url(), referrerHash, entry.initiator(), dc_title, "error storing url: " + e.getMessage(), new kelondroBitfield());
        return;
    }

    // update statistics
    crawlResults.stack(
            newEntry,                      // loaded url db entry
            entry.initiator(),             // initiator peer hash
            yacyCore.seedDB.mySeed().hash, // executor peer hash
            processCase                    // process case
    );

    // STORE WORD INDEX
    // NOTE(review): this profile check runs after wordIndex.storeDocument has already been called - confirm that this order is intended.
    if ((!entry.profile().indexText()) && (!entry.profile().indexMedia())) {
        log.logFine("Not Indexed Resource '" + entry.url().toNormalform(false, true) + "': process case=" + processCase);
        addURLtoErrorDB(entry.url(), referrerHash, entry.initiator(), dc_title, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
        return;
    }

    // increment number of indexed urls
    indexedPages++;

    // update profiling info
    plasmaProfiling.updateIndexedPage(entry);

    // if this was performed for a remote crawl request, notify requester
    yacySeed initiatorPeer = entry.initiatorPeer();
    if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
        log.logInfo("Sending crawl receipt for '" + entry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
        if (clusterhashes != null) initiatorPeer.setAlternativeAddress((String) clusterhashes.get(initiatorPeer.hash));
        yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, "");
    }
}
/*
private void indexDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) throws InterruptedException { private void indexDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) throws InterruptedException {
long indexingStartTime = 0, indexingEndTime = 0, long indexingStartTime = System.currentTimeMillis(), indexingEndTime = 0,
storageStartTime = 0, storageEndTime = 0; storageStartTime = 0, storageEndTime = 0;
// CREATE INDEX // CREATE INDEX
@ -2348,6 +2399,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, ""); yacyClient.crawlReceipt(initiatorPeer, "crawl", "fill", "indexed", newEntry, "");
} }
} }
*/
private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy"); private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy");
public static String dateString(Date date) { public static String dateString(Date date) {

@ -92,8 +92,8 @@ public class plasmaWebStructure {
} }
} }
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(yacyURL url, String baseurlhash, Date docDate, plasmaParserDocument document, plasmaCondenser condenser) { public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(plasmaParserDocument document, plasmaCondenser condenser, Date docDate) {
assert url.hash().equals(baseurlhash); yacyURL url = document.dc_source();
// generate citation reference // generate citation reference
Map<yacyURL, String> hl = document.getHyperlinks(); Map<yacyURL, String> hl = document.getHyperlinks();
@ -101,7 +101,7 @@ public class plasmaWebStructure {
String nexturlhash; String nexturlhash;
StringBuffer cpg = new StringBuffer(12 * (hl.size() + 1) + 1); StringBuffer cpg = new StringBuffer(12 * (hl.size() + 1) + 1);
StringBuffer cpl = new StringBuffer(12 * (hl.size() + 1) + 1); StringBuffer cpl = new StringBuffer(12 * (hl.size() + 1) + 1);
String lhp = baseurlhash.substring(6); // local hash part String lhp = url.hash().substring(6); // local hash part
int GCount = 0; int GCount = 0;
int LCount = 0; int LCount = 0;
while (it.hasNext()) { while (it.hasNext()) {
@ -121,7 +121,7 @@ public class plasmaWebStructure {
// append this reference to buffer // append this reference to buffer
// generate header info // generate header info
String head = baseurlhash + "=" + String head = url.hash() + "=" +
plasmaWordIndex.microDateHoursStr(docDate.getTime()) + // latest update timestamp of the URL plasmaWordIndex.microDateHoursStr(docDate.getTime()) + // latest update timestamp of the URL
plasmaWordIndex.microDateHoursStr(System.currentTimeMillis()) + // last visit timestamp of the URL plasmaWordIndex.microDateHoursStr(System.currentTimeMillis()) + // last visit timestamp of the URL
kelondroBase64Order.enhancedCoder.encodeLongSmart(LCount, 2) + // count of links to local resources kelondroBase64Order.enhancedCoder.encodeLongSmart(LCount, 2) + // count of links to local resources

@ -75,9 +75,11 @@ public final class plasmaWordIndex implements indexRI {
private final indexRAMRI dhtOutCache, dhtInCache; private final indexRAMRI dhtOutCache, dhtInCache;
private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster
private int flushsize; private int flushsize;
private final indexRepositoryReference referenceURL; private serverLog log;
final indexRepositoryReference referenceURL;
public plasmaWordIndex(File indexPrimaryRoot, File indexSecondaryRoot, String networkName, serverLog log) { public plasmaWordIndex(File indexPrimaryRoot, File indexSecondaryRoot, String networkName, serverLog log) {
this.log = log;
File indexPrimaryPath = new File(indexPrimaryRoot, networkName); File indexPrimaryPath = new File(indexPrimaryRoot, networkName);
File indexPrimaryTextLocation = new File(indexPrimaryPath, "TEXT"); File indexPrimaryTextLocation = new File(indexPrimaryPath, "TEXT");
if (!indexPrimaryTextLocation.exists()) { if (!indexPrimaryTextLocation.exists()) {
@ -603,6 +605,77 @@ public final class plasmaWordIndex implements indexRI {
return containers; // this may return less containers as demanded return containers; // this may return less containers as demanded
} }
/**
 * Stores a document in the loaded-URL database and in the word index.
 *
 * A new {@code indexURLReference} is built from the queue entry, the parsed
 * document and the condenser result, written with {@code putURL}, and the page
 * is then added to the word index with {@code addPageIndex}. If info logging is
 * enabled, a summary with the number of indexed words and the time spent in both
 * storage steps is written to the log.
 *
 * @param entry     the switchboard queue entry that is being processed
 * @param document  the parsed document
 * @param condenser the condenser that was computed for the document
 * @return the loaded-URL entry that was created and stored
 * @throws IOException declared by this method; raised by the storage calls it makes
 */
public indexURLReference storeDocument(plasmaSwitchboardQueue.Entry entry, plasmaParserDocument document, plasmaCondenser condenser) throws IOException {
    final long startTime = System.currentTimeMillis();

    // CREATE INDEX
    final String dc_title = document.dc_title();
    final yacyURL referrerURL = entry.referrerURL();
    final Date docDate = entry.getModificationDate();

    // freshdate, computed with Proxy-TTL formula: now plus half of the (non-negative) document age
    final long ldate = System.currentTimeMillis();
    final long documentAge = Math.max(0, ldate - docDate.getTime());
    final Date freshDate = new Date(ldate + documentAge / 2);

    // a document without referrer gets a null referrer hash
    final String referrerHash;
    if (referrerURL == null) {
        referrerHash = null;
    } else {
        referrerHash = referrerURL.hash();
    }

    // create a new loaded URL db entry
    final indexURLReference urlEntry = new indexURLReference(
            entry.url(),                                 // URL
            dc_title,                                    // document description
            document.dc_creator(),                       // author
            document.dc_subject(' '),                    // tags
            "",                                          // ETag
            docDate,                                     // modification date
            new Date(),                                  // loaded date
            freshDate,                                   // freshdate
            referrerHash,                                // referer hash
            new byte[0],                                 // md5
            (int) entry.size(),                          // size
            condenser.RESULT_NUMB_WORDS,                 // word count
            plasmaHTCache.docType(document.dc_format()), // doctype
            condenser.RESULT_FLAGS,                      // flags
            yacyURL.language(entry.url()),               // language
            document.inboundLinks(),                     // inbound links
            document.outboundLinks(),                    // outbound links
            document.getAudiolinks().size(),             // laudio
            document.getImages().size(),                 // limage
            document.getVideolinks().size(),             // lvideo
            document.getApplinks().size()                // lapp
    );

    // STORE URL TO LOADED-URL-DB
    putURL(urlEntry);
    final long storageEndTime = System.currentTimeMillis();

    // STORE PAGE INDEX INTO WORD INDEX DB
    final int wordCount = addPageIndex(
            entry.url(),                                 // document url
            docDate,                                     // document mod date
            (int) entry.size(),                          // document size
            document,                                    // document content
            condenser,                                   // document condenser
            yacyURL.language(entry.url()),               // document language
            plasmaHTCache.docType(document.dc_format()), // document type
            document.inboundLinks(),                     // inbound links
            document.outboundLinks()                     // outbound links
    );
    final long indexingEndTime = System.currentTimeMillis();

    if (log.isInfo()) {
        // TODO: UTF-8 docDescription seems not to be displayed correctly because
        // of string concatenation
        // the message parts are assembled in the order in which they appear in the log line
        final String summary = "*Indexed " + wordCount + " words in URL " + entry.url() +
                " [" + entry.urlHash() + "]";
        final String description = "\n\tDescription: " + dc_title;
        final String properties = "\n\tMimeType: " + document.dc_format() + " | Charset: " + document.getCharset() + " | " +
                "Size: " + document.getTextLength() + " bytes | ";
        final int anchorCount;
        if (document.getAnchors() == null) {
            anchorCount = 0;
        } else {
            anchorCount = document.getAnchors().size();
        }
        final String anchors = "Anchors: " + anchorCount;
        final String timing = "\n\tLinkStorageTime: " + (storageEndTime - startTime) + " ms | " +
                "indexStorageTime: " + (indexingEndTime - storageEndTime) + " ms";
        log.logInfo(summary + description + properties + anchors + timing);
    }

    // finished
    return urlEntry;
}
public synchronized kelondroCloneableIterator<indexContainer> wordContainers(String startHash, boolean ram, boolean rot) { public synchronized kelondroCloneableIterator<indexContainer> wordContainers(String startHash, boolean ram, boolean rot) {
kelondroCloneableIterator<indexContainer> i = wordContainers(startHash, ram); kelondroCloneableIterator<indexContainer> i = wordContainers(startHash, ram);
if (rot) { if (rot) {

Loading…
Cancel
Save