package de.anomic.crawler;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;

import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.DateFormatter;

import de.anomic.search.Segment;
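
/**
 * Imports the word index and URL metadata of an external (foreign) Segment
 * into the local home Segment.
 *
 * A minimal usage sketch, assuming the surrounding Importer framework drives
 * jobs as threads (the thread handling below is an illustration, not part of
 * this class):
 *
 * <pre>
 * Importer job = new ExternalIndexImporter(homeSegment, importSegment);
 * new Thread(job).start(); // run() delegates to importWordsDB()
 * </pre>
 */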
public class ExternalIndexImporter extends AbstractImporter implements Importer {

    /**
     * the source word index (the DB to import)
     */
    private final Segment importWordIndex;

    /**
     * the destination word index (the home DB)
     */
    protected Segment homeWordIndex;
    private final int importStartSize;

    private byte[] wordHash = "------------".getBytes();

    long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = this.wordChunkStart;
    byte[] wordChunkStartHash = "------------".getBytes(), wordChunkEndHash;
    private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0;
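
    /**
     * @param homeWI the destination word index (the home DB)
     * @param importWI the source word index (the DB to import)
     */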
    public ExternalIndexImporter(final Segment homeWI, final Segment importWI) {
        super("PLASMADB");
        this.homeWordIndex = homeWI;
        this.importWordIndex = importWI;
        this.importStartSize = this.importWordIndex.termIndex().sizesMax();
    }

    /**
     * @see Importer#getJobName()
     */
    public String getJobName() {
        return this.importWordIndex.getLocation().toString();
    }

    /**
     * @see Importer#getStatus()
     */
    public String getStatus() {
        final StringBuilder theStatus = new StringBuilder();
        // wordHash is a byte[]; wrap it in a String so the hash itself is
        // printed instead of the array's object reference
        theStatus.append("Hash=").append(new String(this.wordHash)).append("\n");
        theStatus.append("#URL=").append(this.urlCounter).append("\n");
        theStatus.append("#Word Entity=").append(this.wordCounter).append("\n");
        theStatus.append("#Word Entry={").append(this.entryCounter);
        theStatus.append(", NotBound=").append(this.notBoundEntryCounter).append("}");
        return theStatus.toString();
    }

    public void run() {
        try {
            importWordsDB();
        } finally {
            this.globalEnd = System.currentTimeMillis();
            //this.sb.dbImportManager.finishedJobs.add(this);
        }
    }

    /**
     * @see Importer#getProcessingStatusPercent()
     */
    public int getProcessingStatusPercent() {
        // this seems to be more precise:
        // (this.importStartSize-this.importWordIndex.size())*100/((this.importStartSize==0)?1:this.importStartSize);
        // but Integer.MAX_VALUE (2,147,483,647) could be exceeded when word indexes reach 20M entries
        //return (this.importStartSize-this.importWordIndex.size())/((this.importStartSize<100)?1:(this.importStartSize)/100);
        return (int) this.wordCounter / ((this.importStartSize < 100) ? 1 : (this.importStartSize) / 100);
    }

    /**
     * @see Importer#getEstimatedTime()
     */
    public long getEstimatedTime() {
        return (this.wordCounter == 0) ? 0 : ((this.importStartSize * getElapsedTime()) / this.wordCounter) - getElapsedTime();
    }
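
    /**
     * Transfers the complete word index and the referenced URL metadata from
     * the import DB into the home DB: for each word reference container the
     * referenced URL entries are copied first, then the (possibly pruned)
     * container is merged into the home term index and deleted from the
     * import DB.
     */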
    public void importWordsDB() {
        this.log.logInfo("STARTING DB-IMPORT");

        try {
            this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation().getAbsolutePath() + "'");
            this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
            this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");

            final HashSet<String> unknownUrlBuffer = new HashSet<String>();
            final HashSet<String> importedUrlBuffer = new HashSet<String>();

            // iterate over all words from the import db
            //Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, CrawlSwitchboard.RL_WORDFILES, false);
            Iterator<ReferenceContainer<WordReference>> indexContainerIterator = this.importWordIndex.termIndex().references(this.wordChunkStartHash, false, 100, false).iterator();
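
            // the term index is read in chunks of up to 100 reference containers;
            // when a chunk is exhausted, the tail of the loop below fetches the
            // next chunk starting at the last processed word hash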
            while (!isAborted() && indexContainerIterator.hasNext()) {

                final TreeSet<String> entityUrls = new TreeSet<String>();
                ReferenceContainer<WordReference> newContainer = null;
                try {
                    this.wordCounter++;
                    newContainer = indexContainerIterator.next();
                    this.wordHash = newContainer.getTermHash();

                    // loop through the entities of the container and collect the url hashes
                    final Iterator<WordReference> importWordIdxEntries = newContainer.entries();
                    Reference importWordIdxEntry;
                    while (importWordIdxEntries.hasNext()) {

                        // testing if import process was aborted
                        if (isAborted()) break;

                        // getting next word index entry
                        importWordIdxEntry = importWordIdxEntries.next();
                        final String urlHash = importWordIdxEntry.metadataHash();
                        entityUrls.add(urlHash);
                    }
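
                    // copy the metadata row of every referenced URL into the home DB;
                    // URLs that cannot be loaded from the import DB are dropped from
                    // the container so that no dangling references are imported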
                    final Iterator<String> urlIter = entityUrls.iterator();
                    while (urlIter.hasNext()) {
                        if (isAborted()) break;
                        final String urlHash = urlIter.next();

                        if (!importedUrlBuffer.contains(urlHash)) {
                            if (unknownUrlBuffer.contains(urlHash)) {
                                // url already known to be unknown
                                notBoundEntryCounter++;
                                newContainer.remove(urlHash);
                                continue;
                            }
                            // we need to import the url

                            // getting the url entry
                            final URIMetadataRow urlEntry = this.importWordIndex.urlMetadata().load(urlHash, null, 0);
                            if (urlEntry != null) {

                                /* write it into the home url db */
                                homeWordIndex.urlMetadata().store(urlEntry);
                                importedUrlBuffer.add(urlHash);
                                this.urlCounter++;

                                if (this.urlCounter % 500 == 0) {
                                    this.log.logFine(this.urlCounter + " URLs processed so far.");
                                }
                            } else {
                                unknownUrlBuffer.add(urlHash);
                                notBoundEntryCounter++;
                                newContainer.remove(urlHash);
                                continue;
                            }
                        //} else {
                            // already known url
                        }
                        this.entryCounter++;
                    }

                    // testing if import process was aborted
                    if (isAborted()) break;

                    // importing entity container to home db
                    if (newContainer.size() > 0) { homeWordIndex.termIndex().add(newContainer); }

                    // delete complete index entity file
                    this.importWordIndex.termIndex().delete(this.wordHash);

                    // print out some statistical information
                    if (this.entryCounter % 500 == 0) {
                        this.log.logFine(this.entryCounter + " word entries and " + this.wordCounter + " word entities processed so far.");
                    }
                    if (this.wordCounter % 500 == 0) {
                        this.wordChunkEndHash = this.wordHash;
                        this.wordChunkEnd = System.currentTimeMillis();
                        // avoid a division by zero if the chunk finished within the timer resolution
                        final long duration = Math.max(1, this.wordChunkEnd - this.wordChunkStart);
                        this.log.logInfo(this.wordCounter + " word entities imported " +
                                // the chunk hashes are byte[]; convert to Strings for readable log output
                                "[" + new String(this.wordChunkStartHash) + " .. " + new String(this.wordChunkEndHash) + "] " +
                                this.getProcessingStatusPercent() + "%\n" +
                                "Speed: " + 500 * 1000 / duration + " word entities/s" +
                                " | Elapsed time: " + DateFormatter.formatInterval(getElapsedTime()) +
                                " | Estimated time: " + DateFormatter.formatInterval(getEstimatedTime()) + "\n" +
                                "Home Words = " + homeWordIndex.termIndex().sizesMax() +
                                " | Import Words = " + this.importWordIndex.termIndex().sizesMax());
                        this.wordChunkStart = this.wordChunkEnd;
                        this.wordChunkStartHash = this.wordChunkEndHash;
                    }

                } catch (final Exception e) {
                    this.log.logSevere("Import of word entity '" + new String(this.wordHash) + "' failed.", e);
                } finally {
                    if (newContainer != null) newContainer.clear();
                }

                if (!indexContainerIterator.hasNext()) {
                    // We may not be finished yet, try to get the next chunk of wordHashes
                    final TreeSet<ReferenceContainer<WordReference>> containers = this.importWordIndex.termIndex().references(this.wordHash, false, 100, false);
                    indexContainerIterator = containers.iterator();
                    // Make sure we don't get the same wordhash twice, but don't skip a word;
                    // byte[] contents must be compared with Arrays.equals, not Object.equals
                    if ((indexContainerIterator.hasNext()) && (!Arrays.equals(this.wordHash, indexContainerIterator.next().getTermHash()))) {
                        indexContainerIterator = containers.iterator();
                    }
                }
            }

            this.log.logInfo("Home word index contains " + homeWordIndex.termIndex().sizesMax() + " words and " + homeWordIndex.urlMetadata().size() + " URLs.");
            this.log.logInfo("Import word index contains " + this.importWordIndex.termIndex().sizesMax() + " words and " + this.importWordIndex.urlMetadata().size() + " URLs.");
        } catch (final Exception e) {
            this.log.logSevere("Database import failed.", e);
            Log.logException(e);
            this.error = e.toString();
        } finally {
            this.log.logInfo("Import process finished.");
            if (this.importWordIndex != null) try { this.importWordIndex.close(); } catch (final Exception e) { }
        }
    }

}