- can be used to import the crawling queue (noticeUrlDB + stacks) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1518 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
4fa2be73c3
commit
50d85657b8
@ -0,0 +1,212 @@
|
|||||||
|
package de.anomic.plasma.dbImport;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
|
||||||
|
import de.anomic.plasma.plasmaCrawlNURL;
|
||||||
|
import de.anomic.plasma.plasmaCrawlProfile;
|
||||||
|
import de.anomic.plasma.plasmaSwitchboard;
|
||||||
|
import de.anomic.plasma.plasmaCrawlNURL.Entry;
|
||||||
|
|
||||||
|
public class plasmaCrawlNURLImporter extends AbstractImporter implements
|
||||||
|
dbImporter {
|
||||||
|
|
||||||
|
private HashSet importProfileHandleCache = new HashSet();
|
||||||
|
private plasmaCrawlProfile importProfileDB;
|
||||||
|
private plasmaCrawlNURL importNurlDB;
|
||||||
|
private int importStartSize;
|
||||||
|
private int urlCount = 0;
|
||||||
|
private int profileCount = 0;
|
||||||
|
|
||||||
|
/**
 * Creates a new importer for a crawling queue (noticeUrlDB plus stacks)
 * and registers its job type as "NURL".
 *
 * @param theSb the switchboard that is handed on to the parent importer
 */
public plasmaCrawlNURLImporter(plasmaSwitchboard theSb) {
    super(theSb);
    this.jobType = "NURL";
}
|
||||||
|
|
||||||
|
public long getEstimatedTime() {
|
||||||
|
return (this.urlCount==0)?0:((this.importStartSize*getElapsedTime())/(this.urlCount))-getElapsedTime();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getJobName() {
|
||||||
|
return this.importPath.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getProcessingStatusPercent() {
|
||||||
|
return (this.urlCount)/((this.importStartSize<100)?1:(this.importStartSize)/100);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getStatus() {
|
||||||
|
StringBuffer theStatus = new StringBuffer();
|
||||||
|
|
||||||
|
theStatus.append("#URLs=").append(this.urlCount).append("\n");
|
||||||
|
theStatus.append("#Profiles=").append(this.profileCount);
|
||||||
|
|
||||||
|
return theStatus.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void init(File theImportPath, int theCacheSize) {
|
||||||
|
super.init(theImportPath);
|
||||||
|
this.cacheSize = theCacheSize;
|
||||||
|
|
||||||
|
File noticeUrlDbFile = new File(this.importPath,"urlNotice1.db");
|
||||||
|
File profileDbFile = new File(this.importPath, "crawlProfiles0.db");
|
||||||
|
|
||||||
|
String errorMsg = null;
|
||||||
|
if (!this.importPath.exists())
|
||||||
|
errorMsg = "The import path '" + this.importPath + "' does not exist.";
|
||||||
|
else if (!this.importPath.isDirectory())
|
||||||
|
errorMsg = "The import path '" + this.importPath + "' is not a directory.";
|
||||||
|
else if (!this.importPath.canRead())
|
||||||
|
errorMsg = "The import path '" + this.importPath + "' is not readable.";
|
||||||
|
else if (!this.importPath.canWrite())
|
||||||
|
errorMsg = "The import path '" + this.importPath + "' is not writeable.";
|
||||||
|
|
||||||
|
else if (!noticeUrlDbFile.exists())
|
||||||
|
errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist.";
|
||||||
|
else if (noticeUrlDbFile.isDirectory())
|
||||||
|
errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not a file.";
|
||||||
|
else if (!noticeUrlDbFile.canRead())
|
||||||
|
errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not readable.";
|
||||||
|
else if (!noticeUrlDbFile.canWrite())
|
||||||
|
errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' is not writeable.";
|
||||||
|
|
||||||
|
else if (!profileDbFile.exists())
|
||||||
|
errorMsg = "The profileDB file '" + profileDbFile + "' does not exist.";
|
||||||
|
else if (profileDbFile.isDirectory())
|
||||||
|
errorMsg = "The profileDB file '" + profileDbFile + "' is not a file.";
|
||||||
|
else if (!profileDbFile.canRead())
|
||||||
|
errorMsg = "The profileDB file '" + profileDbFile + "' is not readable.";
|
||||||
|
// else if (!profileDbFile.canWrite())
|
||||||
|
// errorMsg = "The profileDB file '" + profileDbFile + "' is not writeable.";
|
||||||
|
|
||||||
|
if (errorMsg != null) {
|
||||||
|
this.log.logSevere(errorMsg);
|
||||||
|
throw new IllegalArgumentException(errorMsg);
|
||||||
|
}
|
||||||
|
|
||||||
|
// init noticeUrlDB
|
||||||
|
this.log.logInfo("Initializing the source noticeUrlDB");
|
||||||
|
this.importNurlDB = new plasmaCrawlNURL(this.importPath, this.cacheSize*(3/4));
|
||||||
|
this.importStartSize = this.importNurlDB.size();
|
||||||
|
int stackSize = this.importNurlDB.stackSize();
|
||||||
|
|
||||||
|
// init profile DB
|
||||||
|
this.log.logInfo("Initializing the source profileDB");
|
||||||
|
this.importProfileDB = new plasmaCrawlProfile(profileDbFile,this.cacheSize*(1/3));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void run() {
|
||||||
|
try {
|
||||||
|
// waiting on init thread to finish
|
||||||
|
this.importNurlDB.waitOnInitThread();
|
||||||
|
|
||||||
|
// the stack types we want to import
|
||||||
|
int[] stackTypes = new int[] {plasmaCrawlNURL.STACK_TYPE_CORE,
|
||||||
|
plasmaCrawlNURL.STACK_TYPE_LIMIT,
|
||||||
|
plasmaCrawlNURL.STACK_TYPE_REMOTE,
|
||||||
|
-1};
|
||||||
|
|
||||||
|
// looping through the various stacks
|
||||||
|
for (int i=0; i< stackTypes.length; i++) {
|
||||||
|
if (stackTypes[i] != -1) {
|
||||||
|
this.log.logInfo("Starting to import stacktype '" + stackTypes[i] + "' containing '" + this.importNurlDB.stackSize(stackTypes[i]) + "' entries.");
|
||||||
|
} else {
|
||||||
|
this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// getting an interator and loop through the URL entries
|
||||||
|
Iterator iter = (stackTypes[i] == -1)?this.importNurlDB.urlHashes("------------", true):null;
|
||||||
|
while (true) {
|
||||||
|
|
||||||
|
String nextHash = null;
|
||||||
|
Entry urlEntry = null;
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (stackTypes[i] != -1) {
|
||||||
|
if (this.importNurlDB.stackSize(stackTypes[i]) == 0) break;
|
||||||
|
|
||||||
|
this.urlCount++;
|
||||||
|
urlEntry = this.importNurlDB.pop(stackTypes[i]);
|
||||||
|
nextHash = urlEntry.hash();
|
||||||
|
} else {
|
||||||
|
if (!iter.hasNext()) break;
|
||||||
|
|
||||||
|
this.urlCount++;
|
||||||
|
nextHash = (String)iter.next();
|
||||||
|
urlEntry = this.importNurlDB.getEntry(nextHash);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
this.log.logWarning("Unable to import entry: " + e.toString());
|
||||||
|
|
||||||
|
if ((stackTypes[i] != -1) &&(this.importNurlDB.stackSize(stackTypes[i]) == 0)) break;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// getting a handler to the crawling profile the url belongs to
|
||||||
|
try {
|
||||||
|
String profileHandle = urlEntry.profileHandle();
|
||||||
|
if (profileHandle == null) {
|
||||||
|
this.log.logWarning("Profile handle of url entry '" + nextHash + "' unknown.");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// if we havn't imported the profile until yet we need to do it now
|
||||||
|
if (!this.importProfileHandleCache.contains(profileHandle)) {
|
||||||
|
|
||||||
|
// testing if the profile is already known
|
||||||
|
plasmaCrawlProfile.entry profileEntry = this.sb.profiles.getEntry(profileHandle);
|
||||||
|
|
||||||
|
// if not we need to import it
|
||||||
|
if (profileEntry == null) {
|
||||||
|
// copy and store the source profile entry into the destination db
|
||||||
|
plasmaCrawlProfile.entry sourceEntry = this.importProfileDB.getEntry(profileHandle);
|
||||||
|
if (sourceEntry != null) {
|
||||||
|
this.profileCount++;
|
||||||
|
this.importProfileHandleCache.add(profileHandle);
|
||||||
|
this.sb.profiles.newEntry((TreeMap)((TreeMap)sourceEntry.map()).clone());
|
||||||
|
} else {
|
||||||
|
this.log.logWarning("Profile '" + profileHandle + "' of url entry '" + nextHash + "' unknown.");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if the url does not alredy exists in the destination stack we insert it now
|
||||||
|
if (!this.sb.urlPool.noticeURL.existsInStack(nextHash)) {
|
||||||
|
this.sb.urlPool.noticeURL.newEntry(urlEntry,(stackTypes[i] != -1)?stackTypes[i]:plasmaCrawlNURL.STACK_TYPE_CORE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// removing hash from the import db
|
||||||
|
} finally {
|
||||||
|
this.importNurlDB.remove(nextHash);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.urlCount % 100 == 0) {
|
||||||
|
this.log.logFine(this.urlCount + " URLs and '" + this.profileCount + "' profile entries processed so far.");
|
||||||
|
}
|
||||||
|
if (this.isAborted()) break;
|
||||||
|
}
|
||||||
|
this.log.logInfo("Finished to import stacktype '" + stackTypes[i] + "'");
|
||||||
|
}
|
||||||
|
|
||||||
|
int size = this.importNurlDB.size();
|
||||||
|
int stackSize = this.importNurlDB.stackSize();
|
||||||
|
|
||||||
|
// TODO: what todo with nurlDB entries that do not exist in any stack?
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
this.error = e.toString();
|
||||||
|
this.log.logSevere("Import process had detected an error",e);
|
||||||
|
} finally {
|
||||||
|
this.log.logInfo("Import process finished.");
|
||||||
|
this.globalEnd = System.currentTimeMillis();
|
||||||
|
this.sb.dbImportManager.finishedJobs.add(this);
|
||||||
|
this.importNurlDB.close();
|
||||||
|
this.importProfileDB.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in new issue