remove TABs from plasmaDbImporter.java

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1916 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
hermens 19 years ago
parent 2b31f51896
commit 474379ae63

@ -46,7 +46,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
theStatus.append("#URL=").append(this.urlCounter).append("\n"); theStatus.append("#URL=").append(this.urlCounter).append("\n");
theStatus.append("#Word Entity=").append(this.wordCounter).append("\n"); theStatus.append("#Word Entity=").append(this.wordCounter).append("\n");
theStatus.append("#Word Entry={").append(this.entryCounter); theStatus.append("#Word Entry={").append(this.entryCounter);
theStatus.append(" ,NotBound=").append(this.notBoundEntryCounter).append("}"); theStatus.append(" ,NotBound=").append(this.notBoundEntryCounter).append("}");
return theStatus.toString(); return theStatus.toString();
} }
@ -90,8 +90,6 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
} }
} }
public int getProcessingStatusPercent() { public int getProcessingStatusPercent() {
// this seems to be better: // this seems to be better:
// (this.importStartSize-this.importWordIndex.size())*100/((this.importStartSize==0)?1:this.importStartSize); // (this.importStartSize-this.importWordIndex.size())*100/((this.importStartSize==0)?1:this.importStartSize);
@ -100,7 +98,6 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
return (int)(this.wordCounter)/((this.importStartSize<100)?1:(this.importStartSize)/100); return (int)(this.wordCounter)/((this.importStartSize<100)?1:(this.importStartSize)/100);
} }
public long getEstimatedTime() { public long getEstimatedTime() {
return (this.wordCounter==0)?0:((this.importStartSize*getElapsedTime())/this.wordCounter)-getElapsedTime(); return (this.wordCounter==0)?0:((this.importStartSize*getElapsedTime())/this.wordCounter)-getElapsedTime();
} }
@ -113,14 +110,14 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs."); this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importUrlDB.size() + " URLs."); this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importUrlDB.size() + " URLs.");
HashSet unknownUrlBuffer = new HashSet(); HashSet unknownUrlBuffer = new HashSet();
HashSet importedUrlBuffer = new HashSet(); HashSet importedUrlBuffer = new HashSet();
// iterate over all words from import db // iterate over all words from import db
Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false); Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
while (!isAborted() && importWordHashIterator.hasNext()) { while (!isAborted() && importWordHashIterator.hasNext()) {
TreeSet entityUrls = new TreeSet(new kelondroNaturalOrder(true)); TreeSet entityUrls = new TreeSet(new kelondroNaturalOrder(true));
plasmaWordIndexEntryContainer newContainer = null; plasmaWordIndexEntryContainer newContainer = null;
try { try {
this.wordCounter++; this.wordCounter++;
@ -130,7 +127,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
if (newContainer.size() == 0) continue; if (newContainer.size() == 0) continue;
// loop through the entities of the container and get the // loop through the entities of the container and get the
// urlhash // urlhash
Iterator importWordIdxEntries = newContainer.entries(); Iterator importWordIdxEntries = newContainer.entries();
plasmaWordIndexEntry importWordIdxEntry; plasmaWordIndexEntry importWordIdxEntry;
while (importWordIdxEntries.hasNext()) { while (importWordIdxEntries.hasNext()) {
@ -140,45 +137,45 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
// getting next word index entry // getting next word index entry
importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next(); importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
String urlHash = importWordIdxEntry.getUrlHash(); String urlHash = importWordIdxEntry.getUrlHash();
entityUrls.add(urlHash); entityUrls.add(urlHash);
} }
Iterator urlIter = entityUrls.iterator(); Iterator urlIter = entityUrls.iterator();
while (urlIter.hasNext()) { while (urlIter.hasNext()) {
if (isAborted()) break; if (isAborted()) break;
String urlHash = (String) urlIter.next(); String urlHash = (String) urlIter.next();
if (importedUrlBuffer.contains(urlHash)) { if (importedUrlBuffer.contains(urlHash)) {
// already known url // already known url
} else if (unknownUrlBuffer.contains(urlHash)) { } else if (unknownUrlBuffer.contains(urlHash)) {
// url known as unknown // url known as unknown
unknownUrlBuffer.add(urlHash); unknownUrlBuffer.add(urlHash);
notBoundEntryCounter++; notBoundEntryCounter++;
newContainer.remove(urlHash); newContainer.remove(urlHash);
continue; continue;
} else { } else {
// we need to import the url // we need to import the url
try { try {
// getting the url entry // getting the url entry
plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, null); plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, null);
/* write it into the home url db */ /* write it into the home url db */
plasmaCrawlLURL.Entry homeEntry = this.homeUrlDB.newEntry(urlEntry); plasmaCrawlLURL.Entry homeEntry = this.homeUrlDB.newEntry(urlEntry);
homeEntry.store(); homeEntry.store();
importedUrlBuffer.add(urlHash); importedUrlBuffer.add(urlHash);
this.urlCounter++; this.urlCounter++;
if (this.urlCounter % 500 == 0) { if (this.urlCounter % 500 == 0) {
this.log.logFine(this.urlCounter + " URLs processed so far."); this.log.logFine(this.urlCounter + " URLs processed so far.");
} }
} catch (IOException e) { } catch (IOException e) {
unknownUrlBuffer.add(urlHash); unknownUrlBuffer.add(urlHash);
notBoundEntryCounter++; notBoundEntryCounter++;
newContainer.remove(urlHash); newContainer.remove(urlHash);
continue; continue;
} }
} }
this.entryCounter++; this.entryCounter++;
} }
// testing if import process was aborted // testing if import process was aborted

Loading…
Cancel
Save