refactoring:

moved importer classes to crawler and plasma package

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4770 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter
parent ee81ff4ef4
commit fbb712c669

@@ -24,14 +24,7 @@
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableCellLight">
<td>Import&nbsp;Type:</td>
<td title="the path to the database that should be imported">
<select name="importType" size="1">
<!-- Options not availible because of missing support for Assortment DB's
<option value="plasmaDB">PLASMA DB Import</option>
<option value="assortment">Assortment File Import</option>-->
<option value="NURL">Crawling Queue Import</option>
</select>
</td>
<td title="the path to the database that should be imported"></td>
<td title="the cache size that should be used for the import db">Cache Size</td>
<td>
<select name="cacheSize" size="1">

@@ -51,9 +51,10 @@
import java.io.PrintStream;
import java.util.Date;
import de.anomic.crawler.NoticeURLImporter;
import de.anomic.crawler.Importer;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.dbImport.dbImporter;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverDate;
import de.anomic.server.serverObjects;
@@ -71,27 +72,12 @@ public final class IndexImport_p {
if (post != null) {
if (post.containsKey("startIndexDbImport")) {
try {
String importType = (String) post.get("importType");
int cacheSize = post.getInt("cacheSize", 0);
boolean startImport = true;
// // check if there is an already running thread with the same import path
// Thread[] importThreads = new Thread[plasmaDbImporter.runningJobs.activeCount()*2];
// activeCount = plasmaDbImporter.runningJobs.enumerate(importThreads);
//
// for (int i=0; i < activeCount; i++) {
// plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i];
// if (currThread.getJobName().equals(new File(importPath))) {
// prop.put("error",2);
// startImport = false;
// }
// }
//
boolean startImport = true;
if (startImport) {
dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType);
Importer importerThread = new NoticeURLImporter(switchboard.plasmaPath, switchboard.crawlQueues, switchboard.profilesActiveCrawls, switchboard.dbImportManager);
if (importerThread != null) {
importerThread.init(switchboard, cacheSize);
importerThread.setJobID(switchboard.dbImportManager.generateUniqueJobID());
importerThread.startIt();
}
prop.put("LOCATION","");
@@ -119,7 +105,7 @@ public final class IndexImport_p {
) {
// getting the job nr of the thread
String jobID = (String) post.get("jobNr");
dbImporter importer = switchboard.dbImportManager.getImporterByID(Integer.valueOf(jobID).intValue());
Importer importer = switchboard.dbImportManager.getImporterByID(Integer.valueOf(jobID).intValue());
if (importer != null) {
if (post.containsKey("stopIndexDbImport")) {
try {
@@ -145,11 +131,11 @@ public final class IndexImport_p {
/*
* Loop over all currently running jobs
*/
dbImporter[] importThreads = switchboard.dbImportManager.getRunningImporter();
Importer[] importThreads = switchboard.dbImportManager.getRunningImporter();
activeCount = importThreads.length;
for (int i=0; i < activeCount; i++) {
dbImporter currThread = importThreads[i];
Importer currThread = importThreads[i];
// get import type
prop.put("running.jobs_" + i + "_type", currThread.getJobType());
@@ -183,9 +169,9 @@ public final class IndexImport_p {
/*
* Loop over all finished jobs
*/
dbImporter[] finishedJobs = switchboard.dbImportManager.getFinishedImporter();
Importer[] finishedJobs = switchboard.dbImportManager.getFinishedImporter();
for (int i=0; i<finishedJobs.length; i++) {
dbImporter currThread = finishedJobs[i];
Importer currThread = finishedJobs[i];
String error = currThread.getError();
String fullName = currThread.getJobName().toString();
String shortName = (fullName.length()>30)?fullName.substring(0,12) + "..." + fullName.substring(fullName.length()-22,fullName.length()):fullName;

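The net effect in IndexImport_p is that the factory lookup (getNewImporter) plus the separate init() step is collapsed into constructor injection. A minimal sketch of the resulting call site, assembled from the hunks above (not additional code from the commit):

    // Hedged sketch: starting an NURL import after this refactoring. All
    // dependencies are passed explicitly; the caller assigns the job ID.
    Importer importerThread = new NoticeURLImporter(
            switchboard.plasmaPath,            // path of the crawler DB to import
            switchboard.crawlQueues,           // destination crawl queues
            switchboard.profilesActiveCrawls,  // active crawl profiles to merge into
            switchboard.dbImportManager);      // manager collecting finished jobs
    importerThread.setJobID(switchboard.dbImportManager.generateUniqueJobID());
    importerThread.startIt();
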
@@ -37,12 +37,12 @@ import java.util.regex.PatternSyntaxException;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.SitemapImporter;
import de.anomic.crawler.ZURL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.dbImport.SitemapImporter;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -340,10 +340,9 @@ public class WatchCrawler_p {
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
// create a new sitemap importer
SitemapImporter importerThread = (SitemapImporter) sb.dbImportManager.getNewImporter("sitemap");
SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new yacyURL(sitemapURLStr, null), pe);
if (importerThread != null) {
importerThread.init(sb, 0);
importerThread.initSitemap(new yacyURL(sitemapURLStr, null), pe);
importerThread.setJobID(sb.dbImportManager.generateUniqueJobID());
importerThread.startIt();
}
} catch (Exception e) {

@@ -1,45 +1,30 @@
package de.anomic.plasma.dbImport;
package de.anomic.crawler;
import java.util.HashMap;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.logging.serverLog;
public abstract class AbstractImporter extends Thread implements dbImporter{
public abstract class AbstractImporter extends Thread implements Importer {
protected int jobID = -1;
protected String jobType;
protected serverLog log;
protected boolean stopped = false;
protected boolean paused = false;
protected int cacheSize;
protected long globalStart = System.currentTimeMillis();
protected long globalEnd;
protected long globalPauseLast;
protected long globalPauseDuration;
protected String error;
protected plasmaSwitchboard sb;
AbstractImporter(String theJobType, plasmaSwitchboard switchboard) {
super(switchboard.dbImportManager.runningJobs,"");
public AbstractImporter(String theJobType) {
this.jobType = theJobType;
this.sb = switchboard;
// initializing the logger and setting a more verbose thread name
this.log = new serverLog("IMPORT_" + this.jobType + "_" + this.jobID);
this.setName("IMPORT_" + this.jobType + "_" + this.jobID);
}
public String getError() {
return this.error;
}
/**
* @see dbImporter#init(HashMap)
*/
public void init() {
// initializing the logger and setting a more verbose thread name
this.log = new serverLog("IMPORT_" + this.jobType + "_" + this.jobID);
this.setName("IMPORT_" + this.jobType + "_" + this.jobID);
}
public void startIt() {

@@ -1,8 +1,6 @@
package de.anomic.plasma.dbImport;
package de.anomic.crawler;
import de.anomic.plasma.plasmaSwitchboard;
public interface dbImporter {
public interface Importer {
// functions to pause and continue importing
public boolean isPaused();
@@ -23,6 +21,5 @@ public interface dbImporter {
public String getJobType();
public String getError();
public String getStatus();
public void init(plasmaSwitchboard switchboard, int cacheSize) throws ImporterException;
public void startIt();
}

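Pieced together from this hunk and from call sites elsewhere in the diff, the renamed interface now looks roughly as follows; this is a reconstruction, and members outside the visible hunks (such as the pause/continue setters hinted at by the comment above) are omitted:

    package de.anomic.crawler;

    // Approximate shape of the interface after the move. The coupled
    // init(plasmaSwitchboard, int) method is gone, so implementations are
    // configured entirely through their constructors.
    public interface Importer {
        public boolean isPaused();
        public void stopIt();
        public void startIt();
        public int getJobID();
        public void setJobID(int id);
        public String getJobType();
        public String getJobName();
        public int getProcessingStatusPercent();
        public long getElapsedTime();
        public long getEstimatedTime();
        public String getError();
        public String getStatus();
    }
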
@@ -1,4 +1,4 @@
package de.anomic.plasma.dbImport;
package de.anomic.crawler;
public class ImporterException extends Exception {

@@ -1,22 +1,19 @@
package de.anomic.plasma.dbImport;
package de.anomic.crawler;
import java.util.Vector;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.logging.serverLog;
public class dbImportManager {
public class ImporterManager {
public final Vector<dbImporter> finishedJobs = new Vector<dbImporter>();
public final Vector<Importer> finishedJobs = new Vector<Importer>();
public final ThreadGroup runningJobs = new ThreadGroup("ImporterThreads");
public int currMaxJobNr = 0;
private plasmaSwitchboard sb;
public dbImportManager(plasmaSwitchboard theSb) {
this.sb = theSb;
public ImporterManager() {
}
private int generateUniqueJobID() {
public int generateUniqueJobID() {
int jobID;
synchronized(this.runningJobs) {
jobID = this.currMaxJobNr;
@@ -25,27 +22,27 @@ public class dbImportManager {
return jobID;
}
public dbImporter[] getRunningImporter() {
public Importer[] getRunningImporter() {
Thread[] importThreads = new Thread[this.runningJobs.activeCount()*2];
int activeCount = this.runningJobs.enumerate(importThreads);
dbImporter[] importers = new dbImporter[activeCount];
Importer[] importers = new Importer[activeCount];
for (int i=0; i<activeCount; i++) {
importers[i] = (dbImporter) importThreads[i];
importers[i] = (Importer) importThreads[i];
}
return importers;
}
public dbImporter[] getFinishedImporter() {
return (dbImporter[]) this.finishedJobs.toArray(new dbImporter[this.finishedJobs.size()]);
public Importer[] getFinishedImporter() {
return (Importer[]) this.finishedJobs.toArray(new Importer[this.finishedJobs.size()]);
}
public dbImporter getImporterByID(int jobID) {
public Importer getImporterByID(int jobID) {
Thread[] importThreads = new Thread[this.runningJobs.activeCount()*2];
int activeCount = this.runningJobs.enumerate(importThreads);
for (int i=0; i < activeCount; i++) {
dbImporter currThread = (dbImporter) importThreads[i];
Importer currThread = (Importer) importThreads[i];
if (currThread.getJobID() == jobID) {
return currThread;
}
@@ -53,25 +50,6 @@ public class dbImportManager {
return null;
}
public dbImporter getNewImporter(String type) {
if (type == null) return null;
if (type.length() == 0) return null;
// create a new importer thread
dbImporter newImporter = null;
if (type.equalsIgnoreCase("NURL")) {
newImporter = new plasmaCrawlNURLImporter(this.sb);
} else if (type.equalsIgnoreCase("sitemap")) {
newImporter = new SitemapImporter(this.sb);
}
// assign a job ID to it
newImporter.setJobID(this.generateUniqueJobID());
// return the newly created importer
return newImporter;
}
/**
* Can be used to close all still running importer threads
* e.g. on server shutdown
@@ -94,7 +72,7 @@ public class dbImportManager {
for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) {
Thread currentThread = threadList[currentThreadIdx];
if (currentThread.isAlive()) {
((dbImporter)currentThread).stopIt();
((Importer)currentThread).stopIt();
}
}

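getRunningImporter() still discovers jobs by enumerating the runningJobs ThreadGroup and casting each Thread back to Importer; note that the removed AbstractImporter constructor was what attached new threads to that group via super(switchboard.dbImportManager.runningJobs, ""), so any replacement registration lies outside the visible hunks. A usage sketch of the slimmed-down manager, with invented local variables:

    // The manager no longer builds importers (getNewImporter was deleted);
    // callers construct the thread themselves and use the manager only for
    // job IDs and end-of-job bookkeeping.
    ImporterManager manager = new ImporterManager();
    try {
        Importer job = new NoticeURLImporter(plasmaPath, crawlQueues, activeCrawls, manager);
        job.setJobID(manager.generateUniqueJobID());
        job.startIt();
    } catch (ImporterException e) {
        // the constructor validates the import path and fails fast
    }
    for (Importer running : manager.getRunningImporter()) {
        System.out.println(running.getJobType() + ": " + running.getStatus());
    }
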
@@ -1,4 +1,4 @@
package de.anomic.plasma.dbImport;
package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
@@ -6,12 +6,9 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.NoticedURL;
import de.anomic.plasma.plasmaSwitchboard;
public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImporter {
public class NoticeURLImporter extends AbstractImporter implements Importer {
private File plasmaPath = null;
private HashSet<String> importProfileHandleCache = new HashSet<String>();
@@ -20,45 +17,24 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
private int importStartSize;
private int urlCount = 0;
private int profileCount = 0;
private CrawlQueues crawlQueues;
private CrawlProfile activeCrawls;
private ImporterManager dbImportManager;
public plasmaCrawlNURLImporter(plasmaSwitchboard theSb) {
super("NURL",theSb);
}
public long getEstimatedTime() {
return (this.urlCount==0)?0:((this.importStartSize*getElapsedTime())/(this.urlCount))-getElapsedTime();
}
public String getJobName() {
return this.plasmaPath.toString();
}
public int getProcessingStatusPercent() {
return (this.urlCount)/((this.importStartSize<100)?1:(this.importStartSize)/100);
}
public String getStatus() {
StringBuffer theStatus = new StringBuffer();
theStatus.append("#URLs=").append(this.urlCount).append("\n");
theStatus.append("#Profiles=").append(this.profileCount);
return theStatus.toString();
}
public void init(plasmaSwitchboard sb, int cacheSize) throws ImporterException {
super.init();
public NoticeURLImporter(File crawlerPath, CrawlQueues crawlQueues, CrawlProfile activeCrawls, ImporterManager dbImportManager) throws ImporterException {
super("NURL");
this.crawlQueues = crawlQueues;
this.activeCrawls = activeCrawls;
this.dbImportManager = dbImportManager;
// TODO: we need more errorhandling here
this.plasmaPath = sb.plasmaPath;
this.cacheSize = cacheSize;
if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024;
// TODO: we need more error handling here
this.plasmaPath = crawlerPath;
File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db");
File profileDbFile = new File(plasmaPath, plasmaSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES);
String errorMsg = null;
if (!plasmaPath.exists())
errorMsg = "The import path '" + plasmaPath+ "' does not exist.";
errorMsg = "The import path '" + plasmaPath + "' does not exist.";
else if (!plasmaPath.isDirectory())
errorMsg = "The import path '" + plasmaPath + "' is not a directory.";
else if (!plasmaPath.canRead())
@@ -100,6 +76,27 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
this.importProfileDB = new CrawlProfile(profileDbFile);
}
public long getEstimatedTime() {
return (this.urlCount==0)?0:((this.importStartSize*getElapsedTime())/(this.urlCount))-getElapsedTime();
}
public String getJobName() {
return this.plasmaPath.toString();
}
public int getProcessingStatusPercent() {
return (this.urlCount)/((this.importStartSize<100)?1:(this.importStartSize)/100);
}
public String getStatus() {
StringBuffer theStatus = new StringBuffer();
theStatus.append("#URLs=").append(this.urlCount).append("\n");
theStatus.append("#Profiles=").append(this.profileCount);
return theStatus.toString();
}
@SuppressWarnings("unchecked")
public void run() {
try {
@@ -161,7 +158,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
if (!this.importProfileHandleCache.contains(profileHandle)) {
// testing if the profile is already known
CrawlProfile.entry profileEntry = this.sb.profilesActiveCrawls.getEntry(profileHandle);
CrawlProfile.entry profileEntry = this.activeCrawls.getEntry(profileHandle);
// if not we need to import it
if (profileEntry == null) {
@@ -170,7 +167,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
if (sourceEntry != null) {
this.profileCount++;
this.importProfileHandleCache.add(profileHandle);
this.sb.profilesActiveCrawls.newEntry((HashMap<String, String>) sourceEntry.map().clone());
this.activeCrawls.newEntry((HashMap<String, String>) sourceEntry.map().clone());
} else {
this.log.logWarning("Profile '" + profileHandle + "' of url entry '" + nextHash + "' unknown.");
continue;
@@ -179,8 +176,8 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
}
// if the url does not alredy exists in the destination stack we insert it now
if (!this.sb.crawlQueues.noticeURL.existsInStack(nextHash)) {
this.sb.crawlQueues.noticeURL.push((stackTypes[stackType] != -1) ? stackTypes[stackType] : NoticedURL.STACK_TYPE_CORE, nextEntry);
if (!this.crawlQueues.noticeURL.existsInStack(nextHash)) {
this.crawlQueues.noticeURL.push((stackTypes[stackType] != -1) ? stackTypes[stackType] : NoticedURL.STACK_TYPE_CORE, nextEntry);
}
// removing hash from the import db
@@ -207,7 +204,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
} finally {
this.log.logInfo("Import process finished.");
this.globalEnd = System.currentTimeMillis();
this.sb.dbImportManager.finishedJobs.add(this);
this.dbImportManager.finishedJobs.add(this);
this.importNurlDB.close();
this.importProfileDB.close();
}

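The progress estimate in getEstimatedTime() above is a plain linear extrapolation: if urlCount of the initial importStartSize entries took getElapsedTime() so far, the total time scales proportionally and the remainder is returned. A worked example with invented values:

    // Worked example of the getEstimatedTime() arithmetic (all values invented):
    long importStartSize = 10000; // entries in the source DB when the import began
    long urlCount        = 2500;  // entries imported so far
    long elapsed         = 60000; // milliseconds elapsed so far
    long estimated = (importStartSize * elapsed) / urlCount - elapsed;
    // (10000 * 60000) / 2500 - 60000 = 240000 - 60000 = 180000 ms remaining
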
@@ -1,4 +1,4 @@
//AbstractParser.java
//SitemapImporter.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@anomic.de
@@ -42,24 +42,34 @@
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.dbImport;
package de.anomic.crawler;
import java.util.HashMap;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.SitemapParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.yacy.yacyURL;
public class SitemapImporter extends AbstractImporter implements dbImporter {
public class SitemapImporter extends AbstractImporter implements Importer {
private SitemapParser parser = null;
private yacyURL sitemapURL = null;
private ImporterManager superviser;
public SitemapImporter(plasmaSwitchboard switchboard) {
super("sitemap",switchboard);
}
public SitemapImporter(plasmaSwitchboard sb, ImporterManager importManager, yacyURL sitemapURL, CrawlProfile.entry profileEntry) throws ImporterException {
super("sitemap");
this.superviser = importManager;
try {
// getting the sitemap URL
this.sitemapURL = sitemapURL;
// creating the sitemap parser
this.parser = new SitemapParser(sb, this.sitemapURL, profileEntry);
} catch (Exception e) {
throw new ImporterException("Unable to initialize Importer",e);
}
}
public long getEstimatedTime() {
long t = getElapsedTime();
int p = getProcessingStatusPercent();
@@ -67,14 +77,14 @@ public class SitemapImporter extends AbstractImporter implements dbImporter {
}
/**
* @see dbImporter#getJobName()
* @see Importer#getJobName()
*/
public String getJobName() {
return this.sitemapURL.toString();
}
/**
* @see dbImporter#getProcessingStatusPercent()
* @see Importer#getProcessingStatusPercent()
*/
public int getProcessingStatusPercent() {
if (this.parser == null) return 0;
@@ -87,7 +97,7 @@ public class SitemapImporter extends AbstractImporter implements dbImporter {
}
/**
* @see dbImporter#getStatus()
* @see Importer#getStatus()
*/
public String getStatus() {
StringBuffer theStatus = new StringBuffer();
@@ -96,33 +106,13 @@ public class SitemapImporter extends AbstractImporter implements dbImporter {
return theStatus.toString();
}
/**
* @see dbImporter#init(HashMap)
* @see AbstractImporter#init(HashMap)
*/
public void init(plasmaSwitchboard switchboard, int cacheSize) throws ImporterException {
super.init();
}
public void initSitemap(yacyURL sitemapURL, CrawlProfile.entry profileEntry) throws ImporterException {
try {
// getting the sitemap URL
this.sitemapURL = sitemapURL;
// creating the sitemap parser
this.parser = new SitemapParser(this.sb,this.sitemapURL, profileEntry);
} catch (Exception e) {
throw new ImporterException("Unable to initialize Importer",e);
}
}
public void run() {
try {
this.parser.parse();
} finally {
this.globalEnd = System.currentTimeMillis();
this.sb.dbImportManager.finishedJobs.add(this);
this.superviser.finishedJobs.add(this);
}
}
}

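Construction and parser setup are merged here: the constructor stores the sitemap URL and immediately builds the SitemapParser, so the separate init()/initSitemap() steps disappear. Inside the surrounding try/catch in WatchCrawler_p, the start sequence reduces to the sketch below; the importerThread != null check kept in that hunk is now vestigial, since new either succeeds or throws.

    // Sketch of the post-refactoring start sequence, per the WatchCrawler_p hunk:
    SitemapImporter importerThread = new SitemapImporter(
            sb, sb.dbImportManager, new yacyURL(sitemapURLStr, null), pe);
    importerThread.setJobID(sb.dbImportManager.generateUniqueJobID());
    importerThread.startIt();
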
@@ -110,11 +110,6 @@ public class SitemapParser extends DefaultHandler {
*/
private CrawlProfile.entry crawlingProfile = null;
/**
* Reference to the plasmaswitchboard.
*/
private plasmaSwitchboard switchboard = null;
/**
* Name of the current XML element
*/
@@ -154,13 +149,11 @@
* last modification date of the {@link #nextURL}
*/
private Date lastMod = null;
private plasmaSwitchboard sb;
public SitemapParser(plasmaSwitchboard sb, yacyURL sitemap, CrawlProfile.entry theCrawlingProfile) {
if (sb == null)
throw new NullPointerException("The switchboard must not be null");
if (sitemap == null)
throw new NullPointerException("The sitemap URL must not be null");
this.switchboard = sb;
assert sitemap != null;
this.sb = sb;
this.siteMapURL = sitemap;
if (theCrawlingProfile == null) {
@@ -281,10 +274,10 @@
// check if the url is known and needs to be recrawled
if (this.lastMod != null) {
String dbocc = this.switchboard.urlExists(nexturlhash);
String dbocc = this.sb.urlExists(nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
// the url was already loaded. we need to check the date
indexURLReference oldEntry = switchboard.wordIndex.getURL(nexturlhash, null, 0);
indexURLReference oldEntry = this.sb.wordIndex.getURL(nexturlhash, null, 0);
if (oldEntry != null) {
Date modDate = oldEntry.moddate();
// check if modDate is null
@@ -296,9 +289,9 @@
// URL needs to crawled
String error = null;
error = this.switchboard.crawlStacker.stackCrawl(url,
error = this.sb.crawlStacker.stackCrawl(url,
null, // this.siteMapURL.toString(),
this.switchboard.wordIndex.seedDB.mySeed().hash, this.nextURL, new Date(),
this.sb.wordIndex.seedDB.mySeed().hash, this.nextURL, new Date(),
0, this.crawlingProfile);
if (error != null) {
@@ -306,9 +299,9 @@
this.logger.logInfo("The URL '" + this.nextURL + "' can not be crawled. Reason: " + error);
// insert URL into the error DB
ZURL.Entry ee = this.switchboard.crawlQueues.errorURL.newEntry(
ZURL.Entry ee = this.sb.crawlQueues.errorURL.newEntry(
new CrawlEntry(
switchboard.wordIndex.seedDB.mySeed().hash,
sb.wordIndex.seedDB.mySeed().hash,
new yacyURL(this.nextURL, null),
"",
"",
@@ -317,12 +310,12 @@
0,
0,
0),
this.switchboard.wordIndex.seedDB.mySeed().hash,
this.sb.wordIndex.seedDB.mySeed().hash,
new Date(),
1,
error);
ee.store();
this.switchboard.crawlQueues.errorURL.push(ee);
this.sb.crawlQueues.errorURL.push(ee);
} catch (MalformedURLException e) {/* ignore this */
}
} else {
@@ -353,7 +346,7 @@
}
private CrawlProfile.entry createProfile(String domainName, yacyURL sitemapURL) {
return this.switchboard.profilesActiveCrawls.newEntry(domainName, sitemapURL,
return this.sb.profilesActiveCrawls.newEntry(domainName, sitemapURL,
// crawlingFilter
".*", ".*",
// Depth

@@ -57,26 +57,9 @@ public final class indexRepositoryReference {
kelondroIndex urlIndexFile;
private Export exportthread = null; // will habe a export thread assigned if exporter is running
public indexRepositoryReference(File indexSecondaryRoot, String networkName) {
public indexRepositoryReference(File indexSecondaryPath) {
super();
File indexSecondaryPath = new File(indexSecondaryRoot, networkName);
File indexSecondaryTextLocation = new File(indexSecondaryPath, "TEXT");
if (!indexSecondaryTextLocation.exists()) {
// patch old index locations; the secondary path is patched in plasmaCrawlLURL
File oldSecondaryPath = new File(new File(indexSecondaryRoot, "PUBLIC"), "TEXT");
File oldSecondaryTextLocation = new File(new File(indexSecondaryRoot, "PUBLIC"), "TEXT");
if (oldSecondaryPath.exists() && oldSecondaryTextLocation.exists()) {
// move the text folder from the old location to the new location
assert !indexSecondaryTextLocation.exists();
indexSecondaryTextLocation.mkdirs();
if (oldSecondaryTextLocation.renameTo(indexSecondaryTextLocation)) {
if (!oldSecondaryPath.delete()) oldSecondaryPath.deleteOnExit();
} else {
indexSecondaryTextLocation = oldSecondaryTextLocation; // emergency case: stay with old directory
}
}
}
File indexSecondaryTextLocation = new File(indexSecondaryPath, "TEXT");
urlIndexFile = new kelondroSplitTable(indexSecondaryTextLocation, "urls", indexURLReference.rowdef, false);
}

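The constructor now takes the final per-network directory instead of composing it from a root and a network name, and the old-location TEXT migration block is dropped here (a similar patch routine remains in plasmaWordIndex, below). Callers adapt as in the yacy.java hunks further down:

    // Before this commit:
    //   new indexRepositoryReference(indexSecondaryRoot, networkName)
    // After it, the caller composes the path itself:
    indexRepositoryReference currentUrlDB =
            new indexRepositoryReference(new File(indexSecondaryRoot, networkName));
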
@@ -1,22 +1,18 @@
package de.anomic.plasma.dbImport;
package de.anomic.plasma;
import java.io.File;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;
import de.anomic.crawler.AbstractImporter;
import de.anomic.crawler.Importer;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIRowEntry;
import de.anomic.index.indexURLReference;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDate;
public class plasmaDbImporter extends AbstractImporter implements dbImporter {
private File importPrimaryPath, importSecondaryPath;
public class plasmaDbImporter extends AbstractImporter implements Importer {
/**
* the source word index (the DB to import)
@@ -36,21 +32,22 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0;
public plasmaDbImporter(plasmaSwitchboard sb, plasmaWordIndex homeWI, plasmaWordIndex importWI) {
super("PLASMADB",sb);
public plasmaDbImporter(plasmaWordIndex homeWI, plasmaWordIndex importWI) {
super("PLASMADB");
this.homeWordIndex = homeWI;
this.importWordIndex = importWI;
this.importStartSize = this.importWordIndex.size();
}
/**
* @see dbImporter#getJobName()
* @see Importer#getJobName()
*/
public String getJobName() {
return this.importPrimaryPath.toString();
return this.importWordIndex.getLocation(true).toString();
}
/**
* @see dbImporter#getStatus()
* @see Importer#getStatus()
*/
public String getStatus() {
StringBuffer theStatus = new StringBuffer();
@@ -64,46 +61,6 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
return theStatus.toString();
}
//public void init(File thePrimaryPath, File theSecondaryPath, int theCacheSize, long preloadTime) {
/**
* @throws ImporterException
* @see dbImporter#init(HashMap)
*/
public void init(plasmaSwitchboard db, int cacheSize) throws ImporterException {
super.init();
// TODO: we need more errorhandling here
this.importPrimaryPath = sb.indexPrimaryPath;
this.importSecondaryPath = sb.indexSecondaryPath;
this.cacheSize = cacheSize;
if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024;
// configure import DB
String errorMsg = null;
if (!this.importPrimaryPath.exists()) errorMsg = "Primary Import directory does not exist.";
if (!this.importPrimaryPath.canRead()) errorMsg = "Primary Import directory is not readable.";
if (!this.importPrimaryPath.canWrite()) errorMsg = "Primary Import directory is not writeable";
if (!this.importPrimaryPath.isDirectory()) errorMsg = "Primary Import directory is not a directory.";
if (errorMsg != null) {
this.log.logSevere(errorMsg + "\nName: " + this.importPrimaryPath.getAbsolutePath());
throw new IllegalArgumentException(errorMsg);
}
if (!this.importSecondaryPath.exists()) errorMsg = "Secondary Import directory does not exist.";
if (!this.importSecondaryPath.canRead()) errorMsg = "Secondary Import directory is not readable.";
if (!this.importSecondaryPath.canWrite()) errorMsg = "Secondary Import directory is not writeable";
if (!this.importSecondaryPath.isDirectory()) errorMsg = "Secondary Import directory is not a directory.";
if (errorMsg != null) {
this.log.logSevere(errorMsg + "\nName: " + this.importSecondaryPath.getAbsolutePath());
throw new IllegalArgumentException(errorMsg);
}
this.log.logFine("Initializing source word index db.");
this.importWordIndex = new plasmaWordIndex(sb.getConfig("network.unit.name", ""), this.log, this.importPrimaryPath, this.importSecondaryPath);
this.importStartSize = this.importWordIndex.size();
}
public void run() {
try {
importWordsDB();
@@ -114,7 +71,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
/**
* @see dbImporter#getProcessingStatusPercent()
* @see Importer#getProcessingStatusPercent()
*/
public int getProcessingStatusPercent() {
// thid seems to be better:
@@ -125,7 +82,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
/**
* @see dbImporter#getElapsedTime()
* @see Importer#getElapsedTime()
*/
public long getEstimatedTime() {
return (this.wordCounter==0)?0:((this.importStartSize*getElapsedTime())/this.wordCounter)-getElapsedTime();
@@ -135,7 +92,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
this.log.logInfo("STARTING DB-IMPORT");
try {
this.log.logInfo("Importing DB from '" + this.importPrimaryPath.getAbsolutePath() + "'/'" + this.importSecondaryPath.getAbsolutePath() + "'");
this.log.logInfo("Importing DB from '" + this.importWordIndex.getLocation(true).getAbsolutePath() + "'");
this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeWordIndex.countURL() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.countURL() + " URLs.");
@@ -267,7 +224,5 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
if (this.importWordIndex != null) try { this.importWordIndex.close(); } catch (Exception e){}
}
}
}

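plasmaDbImporter loses its path fields and the whole init() routine: it now expects the source word index to be opened by the caller and reports its location via plasmaWordIndex.getLocation(). A hedged sketch of the resulting setup; variable names are invented and the actual call site is not part of this diff:

    // Open the source index explicitly, then hand both indexes to the importer.
    plasmaWordIndex importWordIndex = new plasmaWordIndex(
            sb.getConfig("network.unit.name", ""), log,
            importPrimaryRoot, importSecondaryRoot);
    plasmaDbImporter job = new plasmaDbImporter(sb.wordIndex, importWordIndex);
    job.setJobID(sb.dbImportManager.generateUniqueJobID());
    job.startIt();
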
@@ -118,6 +118,7 @@ import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.ProtocolLoader;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.ImporterManager;
import de.anomic.data.URLLicense;
import de.anomic.data.blogBoard;
import de.anomic.data.blogBoardComments;
@@ -144,7 +145,6 @@ import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroMapTable;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.dbImport.dbImportManager;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverBusyThread;
@@ -198,58 +198,57 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
public static wikiParser wikiParser = null;
// storage management
public File htCachePath;
public File plasmaPath;
public File indexPrimaryPath, indexSecondaryPath;
public File listsPath;
public File htDocsPath;
public File rankingPath;
public File workPath;
public File releasePath;
public HashMap<String, String> rankingPermissions;
public plasmaWordIndex wordIndex;
public CrawlQueues crawlQueues;
public ResultURLs crawlResults;
public plasmaSwitchboardQueue sbQueue;
public CrawlStacker crawlStacker;
public messageBoard messageDB;
public wikiBoard wikiDB;
public blogBoard blogDB;
public blogBoardComments blogCommentDB;
public static RobotsTxt robots = null;
public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls;
public CrawlProfile.entry defaultProxyProfile;
public CrawlProfile.entry defaultRemoteProfile;
public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public boolean rankingOn;
public plasmaRankingDistribution rankingOwnDistribution;
public plasmaRankingDistribution rankingOtherDistribution;
public HashMap<String, Object[]> outgoingCookies, incomingCookies;
public kelondroMapTable facilityDB;
public plasmaParser parser;
public volatile long proxyLastAccess, localSearchLastAccess, remoteSearchLastAccess;
public yacyCore yc;
public userDB userDB;
public bookmarksDB bookmarksDB;
public plasmaWebStructure webStructure;
public dbImportManager dbImportManager;
public plasmaDHTFlush transferIdxThread = null;
private plasmaDHTChunk dhtTransferChunk = null;
public ArrayList<plasmaSearchQuery> localSearches; // array of search result properties as HashMaps
public ArrayList<plasmaSearchQuery> remoteSearches; // array of search result properties as HashMaps
public File htCachePath;
public File plasmaPath;
public File listsPath;
public File htDocsPath;
public File rankingPath;
public File workPath;
public File releasePath;
public HashMap<String, String> rankingPermissions;
public plasmaWordIndex wordIndex;
public CrawlQueues crawlQueues;
public ResultURLs crawlResults;
public plasmaSwitchboardQueue sbQueue;
public CrawlStacker crawlStacker;
public messageBoard messageDB;
public wikiBoard wikiDB;
public blogBoard blogDB;
public blogBoardComments blogCommentDB;
public static RobotsTxt robots = null;
public CrawlProfile profilesActiveCrawls, profilesPassiveCrawls;
public CrawlProfile.entry defaultProxyProfile;
public CrawlProfile.entry defaultRemoteProfile;
public CrawlProfile.entry defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile.entry defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public boolean rankingOn;
public plasmaRankingDistribution rankingOwnDistribution;
public plasmaRankingDistribution rankingOtherDistribution;
public HashMap<String, Object[]> outgoingCookies, incomingCookies;
public kelondroMapTable facilityDB;
public plasmaParser parser;
public volatile long proxyLastAccess, localSearchLastAccess, remoteSearchLastAccess;
public yacyCore yc;
public userDB userDB;
public bookmarksDB bookmarksDB;
public plasmaWebStructure webStructure;
public ImporterManager dbImportManager;
public plasmaDHTFlush transferIdxThread = null;
private plasmaDHTChunk dhtTransferChunk = null;
public ArrayList<plasmaSearchQuery> localSearches; // array of search result properties as HashMaps
public ArrayList<plasmaSearchQuery> remoteSearches; // array of search result properties as HashMaps
public HashMap<String, TreeSet<Long>> localSearchTracker, remoteSearchTracker; // mappings from requesting host to a TreeSet of Long(access time)
public long lastseedcheckuptime = -1;
public long indexedPages = 0;
public long lastindexedPages = 0;
public double requestedQueries = 0d;
public double lastrequestedQueries = 0d;
public int totalPPM = 0;
public double totalQPM = 0d;
public TreeMap<String, String> clusterhashes; // map of peerhash(String)/alternative-local-address as ip:port or only ip (String) or null if address in seed should be used
public boolean acceptLocalURLs, acceptGlobalURLs;
public URLLicense licensedURLs;
public Timer moreMemory;
public long lastseedcheckuptime = -1;
public long indexedPages = 0;
public long lastindexedPages = 0;
public double requestedQueries = 0d;
public double lastrequestedQueries = 0d;
public int totalPPM = 0;
public double totalQPM = 0d;
public TreeMap<String, String> clusterhashes; // map of peerhash(String)/alternative-local-address as ip:port or only ip (String) or null if address in seed should be used
public boolean acceptLocalURLs, acceptGlobalURLs;
public URLLicense licensedURLs;
public Timer moreMemory;
public serverProcessor<indexingQueueEntry> indexingDocumentProcessor;
public serverProcessor<indexingQueueEntry> indexingCondensementProcessor;
@@ -921,13 +920,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
this.acceptGlobalURLs = "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0;
this.acceptLocalURLs = "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0;
// load values from configs
// load values from configs
this.plasmaPath = getConfigPath(PLASMA_PATH, PLASMA_PATH_DEFAULT);
this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString());
this.indexPrimaryPath = getConfigPath(INDEX_PRIMARY_PATH, INDEX_PATH_DEFAULT);
this.log.logConfig("Index Primary Path: " + this.indexPrimaryPath.toString());
this.indexSecondaryPath = (getConfig(INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(INDEX_SECONDARY_PATH, ""));
this.log.logConfig("Index Secondary Path: " + this.indexSecondaryPath.toString());
File indexPrimaryPath = getConfigPath(INDEX_PRIMARY_PATH, INDEX_PATH_DEFAULT);
this.log.logConfig("Index Primary Path: " + indexPrimaryPath.toString());
File indexSecondaryPath = (getConfig(INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(INDEX_SECONDARY_PATH, ""));
this.log.logConfig("Index Secondary Path: " + indexSecondaryPath.toString());
this.listsPath = getConfigPath(LISTS_PATH, LISTS_PATH_DEFAULT);
this.log.logConfig("Lists Path: " + this.listsPath.toString());
this.htDocsPath = getConfigPath(HTDOCS_PATH, HTDOCS_PATH_DEFAULT);
@@ -1277,7 +1276,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<plasmaSwitchbo
//plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true);
//plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/kiosk/archiv/ct/2003/4/20"), query, true, 260);
this.dbImportManager = new dbImportManager(this);
this.dbImportManager = new ImporterManager();
log.logConfig("Finished Switchboard Initialization");
}

@@ -82,13 +82,13 @@ public final class plasmaWordIndex implements indexRI {
final indexRepositoryReference referenceURL;
public yacySeedDB seedDB;
public yacyNewsPool newsPool;
private File primaryRoot, secondaryRoot;
public plasmaWordIndex(String networkName, serverLog log, File indexPrimaryRoot, File indexSecondaryRoot) {
this.log = log;
File indexPrimaryPath = new File(indexPrimaryRoot, networkName);
File indexPrimaryTextLocation = new File(indexPrimaryPath, "TEXT");
this.primaryRoot = new File(indexPrimaryRoot, networkName);
this.secondaryRoot = new File(indexSecondaryRoot, networkName);
File indexPrimaryTextLocation = new File(this.primaryRoot, "TEXT");
if (!indexPrimaryTextLocation.exists()) {
// patch old index locations; the secondary path is patched in plasmaCrawlLURL
File oldPrimaryPath = new File(new File(indexPrimaryRoot, "PUBLIC"), "TEXT");
@@ -116,10 +116,10 @@ public final class plasmaWordIndex implements indexRI {
this.collections = new indexCollectionRI(textindexcollections, "collection", maxCollectionPartition, indexRWIRowEntry.urlEntryRow);
// create LURL-db
referenceURL = new indexRepositoryReference(indexSecondaryRoot, networkName);
referenceURL = new indexRepositoryReference(this.secondaryRoot);
// create or init seed cache
File networkRoot = new File(indexPrimaryPath, "NETWORK");
File networkRoot = new File(this.primaryRoot, "NETWORK");
networkRoot.mkdirs();
File mySeedFile = new File(networkRoot, "mySeed.txt");
File oldSeedFile = new File(new File(indexPrimaryRoot.getParentFile(), "YACYDB"), "mySeed.txt");
@@ -133,7 +133,10 @@ public final class plasmaWordIndex implements indexRI {
// create or init news database
newsPool = new yacyNewsPool(networkRoot);
}
public File getLocation(boolean primary) {
return (primary) ? this.primaryRoot : this.secondaryRoot;
}
public void putURL(indexURLReference entry) throws IOException {

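The new getLocation(primary) accessor backs the changes above: plasmaDbImporter.getJobName() and its import log line can report the index location without the importer keeping File fields of its own.

    // getLocation() exposes the per-network roots computed in the constructor.
    File primaryRoot   = wordIndex.getLocation(true);   // <indexPrimaryRoot>/<networkName>
    File secondaryRoot = wordIndex.getLocation(false);  // <indexSecondaryRoot>/<networkName>
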
@@ -635,10 +635,10 @@ public final class yacy {
log.logInfo("STARTING URL CLEANUP");
// db containing all currently loades urls
indexRepositoryReference currentUrlDB = new indexRepositoryReference(indexSecondaryRoot, networkName);
indexRepositoryReference currentUrlDB = new indexRepositoryReference(new File(indexSecondaryRoot, networkName));
// db used to hold all neede urls
indexRepositoryReference minimizedUrlDB = new indexRepositoryReference(indexRoot2, networkName);
indexRepositoryReference minimizedUrlDB = new indexRepositoryReference(new File(indexRoot2, networkName));
int cacheMem = (int)(serverMemory.max() - serverMemory.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
@@ -817,7 +817,7 @@ public final class yacy {
File root = homePath;
File indexroot = new File(root, "DATA/INDEX");
try {serverLog.configureLogging(homePath, new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
indexRepositoryReference currentUrlDB = new indexRepositoryReference(indexroot, networkName);
indexRepositoryReference currentUrlDB = new indexRepositoryReference(new File(indexroot, networkName));
currentUrlDB.deadlinkCleaner(null);
currentUrlDB.close();
}
