From 40dd6ec4fd933d50878af7dbe3c8a2f9a96334a2 Mon Sep 17 00:00:00 2001
From: theli
Date: Thu, 16 Feb 2006 13:07:01 +0000
Subject: [PATCH] *) experimental restructuring of db import function
   - trying to reduce IO load by avoiding unnecessary db access
   - trying to presort url list

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1671 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../plasma/dbImport/plasmaDbImporter.java | 85 +++++++++++++------
 1 file changed, 60 insertions(+), 25 deletions(-)

diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
index 93c4fe225..a51db1baf 100644
--- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
@@ -2,7 +2,11 @@ package de.anomic.plasma.dbImport;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.HashSet;
 import java.util.Iterator;
+import java.util.TreeSet;
+
+import de.anomic.kelondro.kelondroNaturalOrder;
 import de.anomic.plasma.plasmaCrawlLURL;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaWordIndex;
@@ -17,14 +21,13 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
 
     private plasmaCrawlLURL importUrlDB;
     private plasmaWordIndex importWordIndex;
-    private int importStartSize;
-
+    private int importStartSize;
     private String wordHash = "------------";
 
     long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = this.wordChunkStart;
     String wordChunkStartHash = "------------", wordChunkEndHash;
-    private long urlCounter = 0, wordCounter = 0, entryCounter = 0;
+    private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0;
 
     public plasmaDbImporter(plasmaSwitchboard theSb) {
@@ -41,8 +44,9 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
 
         theStatus.append("Hash=").append(this.wordHash).append("\n");
         theStatus.append("#URL=").append(this.urlCounter).append("\n");
-        theStatus.append("#Word Entities=").append(this.wordCounter).append("\n");
-        theStatus.append("#Word Entries=").append(this.entryCounter);
+        theStatus.append("#Word Entity=").append(this.wordCounter).append("\n");
+        theStatus.append("#Word Entry={").append(this.entryCounter);
+        theStatus.append(", NotBound=").append(this.notBoundEntryCounter).append("}");
 
         return theStatus.toString();
     }
@@ -109,11 +113,14 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
             this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs.");
             this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importUrlDB.size() + " URLs.");
 
+            HashSet unknownUrlBuffer = new HashSet();
+            HashSet importedUrlBuffer = new HashSet();
+
             // iterate over all words from import db
-
             Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
             while (!isAborted() && importWordHashIterator.hasNext()) {
+                TreeSet entityUrls = new TreeSet(new kelondroNaturalOrder(true));
                 plasmaWordIndexEntryContainer newContainer = null;
                 try {
                     this.wordCounter++;
@@ -122,44 +129,72 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
 
                     if (newContainer.size() == 0) continue;
 
-                    // the combined container will fit, read the container
+                    // loop through the entries of the container and get the
+                    // url hashes
                     Iterator importWordIdxEntries = newContainer.entries();
                     plasmaWordIndexEntry importWordIdxEntry;
                     while (importWordIdxEntries.hasNext()) {
-                        // testing if import process was aborted
                        if (isAborted()) break;

                        // getting next word index entry
-                        this.entryCounter++;
                        importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
-                        String urlHash = importWordIdxEntry.getUrlHash();
-                        if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) try {
-                            // importing the new url
-                            plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, importWordIdxEntry);
-                            this.urlCounter++;
-                            this.homeUrlDB.newEntry(urlEntry);
-
-                            if (this.urlCounter % 500 == 0) {
-                                this.log.logFine(this.urlCounter + " URLs processed so far.");
-                            }
-                        } catch (IOException e) {}
-
-                        if (this.entryCounter % 500 == 0) {
-                            this.log.logFine(this.entryCounter + " word entries and " + this.wordCounter + " word entities processed so far.");
-                        }
+                        String urlHash = importWordIdxEntry.getUrlHash();
+                        entityUrls.add(urlHash);
                     }
-
+
+                    Iterator urlIter = entityUrls.iterator();
+                    while (urlIter.hasNext()) {
+                        if (isAborted()) break;
+                        String urlHash = (String) urlIter.next();
+
+                        if (importedUrlBuffer.contains(urlHash)) {
+                            // already known url
+                        } else if (unknownUrlBuffer.contains(urlHash)) {
+                            // url known as unknown
+                            unknownUrlBuffer.add(urlHash);
+                            notBoundEntryCounter++;
+                            newContainer.remove(urlHash);
+                            continue;
+                        } else {
+                            // we need to import the url
+                            try {
+                                // getting the url entry
+                                plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash, null);
+
+                                /* write it into the home url db */
+                                this.homeUrlDB.newEntry(urlEntry);
+                                importedUrlBuffer.add(urlHash);
+                                this.urlCounter++;
+
+                                if (this.urlCounter % 500 == 0) {
+                                    this.log.logFine(this.urlCounter + " URLs processed so far.");
+                                }
+                            } catch (IOException e) {
+                                unknownUrlBuffer.add(urlHash);
+                                notBoundEntryCounter++;
+                                newContainer.remove(urlHash);
+                                continue;
+                            }
+                        }
+                        this.entryCounter++;
+                    }
+
                     // testing if import process was aborted
                     if (isAborted()) break;
 
                     // importing entity container to home db
+                    if (newContainer.size() == 0) continue;
                     this.homeWordIndex.addEntries(newContainer, System.currentTimeMillis(), false);
 
                     // delete complete index entity file
                     this.importWordIndex.deleteIndex(this.wordHash);
 
                     // print out some statistical information
+                    if (this.entryCounter % 500 == 0) {
+                        this.log.logFine(this.entryCounter + " word entries and " + this.wordCounter + " word entities processed so far.");
+                    }
+
                     if (this.wordCounter%500 == 0) {
                         this.wordChunkEndHash = this.wordHash;
                         this.wordChunkEnd = System.currentTimeMillis();
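
For reference, here is a minimal stand-alone sketch of the access pattern this patch is after: collect the url hashes of one word entity into a sorted set first, then resolve each hash at most once, with two in-memory buffers short-circuiting hashes that were already imported or already failed to resolve. All names in the sketch (PresortedImportSketch, lookupUrl, importEntity) are hypothetical stand-ins for the YaCy types in the diff, and plain String ordering stands in for kelondroNaturalOrder.

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;

// Hypothetical illustration only; not YaCy code.
public class PresortedImportSketch {

    // caches that avoid repeated db lookups, mirroring the patch's
    // importedUrlBuffer / unknownUrlBuffer
    private final HashSet importedUrlBuffer = new HashSet();
    private final HashSet unknownUrlBuffer = new HashSet();

    // hypothetical stand-in for importUrlDB.getEntry(); throws when a
    // url hash is not bound to an entry
    private String lookupUrl(String urlHash) throws IOException {
        if (urlHash.startsWith("x")) throw new IOException("not bound: " + urlHash);
        return "entry-for-" + urlHash;
    }

    public void importEntity(String[] urlHashes) {
        // presort: a TreeSet orders the hashes so the backing db is
        // visited in key order rather than in random access order
        TreeSet entityUrls = new TreeSet();
        for (int i = 0; i < urlHashes.length; i++) entityUrls.add(urlHashes[i]);

        Iterator urlIter = entityUrls.iterator();
        while (urlIter.hasNext()) {
            String urlHash = (String) urlIter.next();

            if (importedUrlBuffer.contains(urlHash)) continue; // already imported, no db access
            if (unknownUrlBuffer.contains(urlHash)) continue;  // known failure, no db access

            try {
                String urlEntry = lookupUrl(urlHash); // the only db access per hash
                importedUrlBuffer.add(urlHash);
                System.out.println("imported " + urlEntry);
            } catch (IOException e) {
                unknownUrlBuffer.add(urlHash);        // remember the failure
            }
        }
    }

    public static void main(String[] args) {
        PresortedImportSketch sketch = new PresortedImportSketch();
        sketch.importEntity(new String[] { "bbb", "aaa", "xzz" });
        // a second entity sharing hashes resolves entirely from the buffers
        sketch.importEntity(new String[] { "aaa", "xzz", "ccc" });
    }
}

The presort matters because the url db is a kelondro tree file: visiting keys in sorted order should turn scattered reads into mostly sequential ones, which is presumably where the IO savings mentioned in the commit message come from.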