|
|
|
@ -77,7 +77,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
|
|
|
|
|
this.log.logFine("Initializing source word index db.");
|
|
|
|
|
this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2, this.log, sb.getConfigBool("useCollectionIndex", false));
|
|
|
|
|
this.log.logFine("Initializing import URL db.");
|
|
|
|
|
this.importUrlDB = new plasmaCrawlLURL(new File(this.importPath, "urlHash.db"), (this.cacheSize/2)/1024, preloadTime / 2, false);
|
|
|
|
|
this.importUrlDB = new plasmaCrawlLURL(this.importPath, (this.cacheSize/2)/1024, preloadTime / 2, false);
|
|
|
|
|
this.importStartSize = this.importWordIndex.size();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -154,9 +154,10 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
|
|
|
|
|
continue;
|
|
|
|
|
} else {
|
|
|
|
|
// we need to import the url
|
|
|
|
|
try {
|
|
|
|
|
// getting the url entry
|
|
|
|
|
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.load(urlHash, null);
|
|
|
|
|
|
|
|
|
|
// getting the url entry
|
|
|
|
|
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.load(urlHash, null);
|
|
|
|
|
if (urlEntry != null) {
|
|
|
|
|
|
|
|
|
|
/* write it into the home url db */
|
|
|
|
|
this.homeUrlDB.store(urlEntry, false);
|
|
|
|
@ -166,7 +167,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
|
|
|
|
|
if (this.urlCounter % 500 == 0) {
|
|
|
|
|
this.log.logFine(this.urlCounter + " URLs processed so far.");
|
|
|
|
|
}
|
|
|
|
|
} catch (IOException e) {
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
unknownUrlBuffer.add(urlHash);
|
|
|
|
|
notBoundEntryCounter++;
|
|
|
|
|
newContainer.remove(urlHash);
|
|
|
|
|