*) Bookmarks: an Ajax busy icon is now displayed while the page title is being loaded

*) First version of a sitemap parser added
   - currently the sitemap URL is only autodetected (via the robots.txt Sitemap directive); manual entry is not yet supported
*) DB-Import restructured
   - pause/resume should work again now


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3666 6c8d7289-2bf4-0310-a012-ef5d649a1542
theli 18 years ago
parent 269d5ca45b
commit 6f46245a51
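
For orientation before the hunks: the DB-Import restructure replaces the importers' fixed-signature init(File, File, File, int, long) with a generic init(HashMap) taking string parameters. A minimal sketch of starting the new sitemap import under that contract, mirroring the WatchCrawler_p.java changes below (the switchboard instance and profileHandle are assumptions here):

    HashMap initParams = new HashMap();
    initParams.put("sitemapURL", "http://www.example.com/sitemap.xml"); // hypothetical URL
    initParams.put("crawlingProfile", profileHandle); // handle of an existing crawl profile (assumed)
    dbImporter importerThread = switchboard.dbImportManager.getNewImporter("sitemap");
    if (importerThread != null) {
        try {
            importerThread.init(initParams); // throws ImporterException on bad parameters
            importerThread.startIt();
        } catch (ImporterException e) {
            e.printStackTrace();
        }
    }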

@ -44,6 +44,7 @@
::
<input type="submit" name="add" value="edit" />
#(/edit)#
<img src="/env/grafics/empty.gif" name="ajax" />
</fieldset>
</form>
::

@ -26,22 +26,29 @@
<tr valign="top" class="TableCellSummary">
<td>Starting Point:</td>
<td>
<table cellpadding="0" cellspacing="0">
<tr>
<td><label for="url"><nobr>From URL</nobr></label>:</td>
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td>
<input name="crawlingURL" type="text" size="41" maxlength="256" value="http://" onkeypress="changed()" />
<span id="robotsOK"></span>
<span id="robotsOK"></span>
</td>
</tr>
<tr>
<td><label for="url"><nobr>From Sitemap</nobr></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="true"/></td>
<td>
<input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="true"/>
</td>
</tr>
<tr>
<td><label for="file"><nobr>From File</nobr></label>:</td>
<td><input type="radio" name="crawlingMode" id="file" value="file" /></td>
<td><input type="file" name="crawlingFile" size="28" /></td>
</tr>
<tr>
<td colspan="3" class="commit"><span id="title"><br></span></td>
<td colspan="3" class="commit"><span id="title"><br></span><img src="/env/grafics/empty.gif" name="ajax" /></td>
</tr>
</table>
</td>

@ -51,6 +51,7 @@
import java.io.File;
import java.io.PrintStream;
import java.util.Date;
import java.util.HashMap;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
@ -78,12 +79,13 @@ public final class IndexImport_p {
String importIndexSecondaryPath = (String) post.get("importIndexSecondaryPath");
String importType = (String) post.get("importType");
String cacheSizeStr = (String) post.get("cacheSize");
int cacheSize = 8*1024*1024;
try {
cacheSize = Integer.valueOf(cacheSizeStr).intValue();
} catch (NumberFormatException e) {}
boolean startImport = true;
HashMap initParams = new HashMap();
initParams.put("plasmaPath",importPlasmaPath);
initParams.put("cacheSize",cacheSizeStr);
initParams.put("preloadTime","100");
// // check if there is an already running thread with the same import path
// Thread[] importThreads = new Thread[plasmaDbImporter.runningJobs.activeCount()*2];
// activeCount = plasmaDbImporter.runningJobs.enumerate(importThreads);
@ -100,7 +102,7 @@ public final class IndexImport_p {
if (startImport) {
dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType);
if (importerThread != null) {
importerThread.init(new File(importPlasmaPath), new File(importIndexPrimaryPath), new File(importIndexSecondaryPath), cacheSize, 100);
importerThread.init(initParams);
importerThread.startIt();
}
prop.put("LOCATION","");

@ -9,7 +9,7 @@
<script type="text/javascript" src="/js/WatchCrawler.js"></script></head>
<body id="watchCrawler"> #%env/templates/header.template%#
<h2>Crawler Monitor</h2>
<p> Next update in <span id="nextUpdate" onclick="changeInterval()"></span> seconds.
<p> Next update in <span id="nextUpdate" onclick="changeInterval()"></span> seconds. <img src="/env/grafics/empty.gif" name="ajax" />
</p>
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>

@ -35,6 +35,7 @@ import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import de.anomic.data.SitemapParser;
import de.anomic.data.wikiCode;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
@ -44,6 +45,7 @@ import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.dbImport.dbImporter;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -52,6 +54,10 @@ import de.anomic.yacy.yacyNewsPool;
import de.anomic.yacy.yacyNewsRecord;
public class WatchCrawler_p {
public static final String CRAWLING_MODE_URL = "url";
public static final String CRAWLING_MODE_FILE = "file";
public static final String CRAWLING_MODE_SITEMAP = "sitemap";
// this servlet does NOT create the WatchCrawler page content!
// this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html
@ -144,7 +150,7 @@ public class WatchCrawler_p {
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
String crawlingMode = post.get("crawlingMode","url");
if (crawlingMode.equals("url")) {
if (crawlingMode.equals(CRAWLING_MODE_URL)) {
// getting the crawljob start url
String crawlingStart = post.get("crawlingURL","");
crawlingStart = crawlingStart.trim();
@ -236,7 +242,7 @@ public class WatchCrawler_p {
e.printStackTrace();
}
} else if (crawlingMode.equals("file")) {
} else if (crawlingMode.equals(CRAWLING_MODE_FILE)) {
if (post.containsKey("crawlingFile")) {
// getting the name of the uploaded file
String fileName = (String) post.get("crawlingFile");
@ -316,6 +322,38 @@ public class WatchCrawler_p {
e.printStackTrace();
}
}
} else if (crawlingMode.equals(CRAWLING_MODE_SITEMAP)) {
String sitemapURLStr = null;
try {
// getting the sitemap URL
sitemapURLStr = post.get("sitemapURL","");
// create a new profile
plasmaCrawlProfile.entry pe = switchboard.profiles.newEntry(
sitemapURLStr, sitemapURLStr, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
// create a new sitemap importer
dbImporter importerThread = switchboard.dbImportManager.getNewImporter("sitemap");
if (importerThread != null) {
HashMap initParams = new HashMap();
initParams.put("sitemapURL",sitemapURLStr);
initParams.put("crawlingProfile",pe.handle());
importerThread.init(initParams);
importerThread.startIt();
}
} catch (Exception e) {
// something went wrong
prop.put("info", 6); // error with the given URL
prop.put("info_crawlingStart", sitemapURLStr);
prop.put("info_error", e.getMessage());
e.printStackTrace();
}
}
}
}

Binary file not shown.


@ -1,11 +1,20 @@
AJAX_OFF="/env/grafics/empty.gif";
AJAX_ON="/env/grafics/ajax.gif";
function handleResponse(){
if(http.readyState == 4){
var response = http.responseXML;
title=response.getElementsByTagName("title")[0].firstChild.nodeValue;
document.getElementsByName("title")[0].value=title;
// remove the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);
}
}
function loadTitle(){
// displaying ajax image
document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON);
url=document.getElementsByName("url")[0].value;
if(document.getElementsByName("title")[0].value==""){
sndReq('/xml/util/getpageinfo_p.xml?actions=title&url='+url);

@ -1,16 +1,22 @@
AJAX_OFF="/env/grafics/empty.gif";
AJAX_ON="/env/grafics/ajax.gif";
timeout="";
function handleResponse(){
if(http.readyState == 4){
var response = http.responseXML;
title="";
robotsOK="";
// getting the document title
title="";
if(response.getElementsByTagName("title")[0].firstChild!=null){
title=response.getElementsByTagName("title")[0].firstChild.nodeValue;
}
document.getElementById("title").innerHTML=title;
// determine if crawling is allowed by the robots.txt
robotsOK="";
if(response.getElementsByTagName("robots")[0].firstChild!=null){
robotsOK=response.getElementsByTagName("robots")[0].firstChild.nodeValue;
}
document.getElementById("title").innerHTML=title;
robotsOKspan=document.getElementById("robotsOK");
if(robotsOKspan.firstChild){
robotsOKspan.removeChild(robotsOKspan.firstChild);
@ -31,7 +37,18 @@ function handleResponse(){
}else{
robotsOKspan.appendChild(document.createTextNode(""));
document.getElementById("robotsOK").innerHTML="";
}
}
// getting the sitemap URL contained in the robots.txt
sitemap="";
if(response.getElementsByTagName("sitemap")[0].firstChild!=null){
sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue;
}
document.getElementsByName("sitemapURL")[0].value=sitemap;
document.getElementById("sitemap").disabled=false;
// clear the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);
}
}
function changed(){
@ -39,6 +56,9 @@ function changed(){
timeout=window.setTimeout("loadInfos()", 1500);
}
function loadInfos(){
// displaying ajax image
document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON);
url=document.getElementsByName("crawlingURL")[0].value;
sndReq('/xml/util/getpageinfo_p.xml?actions=title,robots&url='+url);
}

@ -101,11 +101,18 @@ public class getpageinfo_p {
}
if(actions.indexOf("robots")>=0){
try {
if(robotsParser.isDisallowed(new URL(url))){
URL theURL = new URL(url);
// determine if crawling of the current URL is allowed
if(robotsParser.isDisallowed(theURL)){
prop.put("robots-allowed", 0);
}else{
prop.put("robots-allowed", 1);
}
// get the sitemap URL of the domain
URL sitemapURL = robotsParser.getSitemapURL(theURL);
prop.put("sitemap", (sitemapURL==null)?"":sitemapURL.toString());
} catch (MalformedURLException e) {}
}

@ -2,4 +2,5 @@
<pageinfo>
<title>#[title]#</title>
<robots>#(robots-allowed)#0::1::#(/robots-allowed)#</robots>
<sitemap>#[sitemap]#</sitemap>
</pageinfo>
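
With the new element in place, a filled-in response from /xml/util/getpageinfo_p.xml would look roughly like this (values hypothetical; robots 1 means crawling is allowed, and sitemap stays empty when robots.txt names none):

    <pageinfo>
      <title>Example Page</title>
      <robots>1</robots>
      <sitemap>http://www.example.com/sitemap.xml</sitemap>
    </pageinfo>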

@ -0,0 +1,342 @@
//SitemapParser.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2007
//
//this file is contributed by Martin Thelian
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this software or this documentation. The usage of this software
//is at your own risk. The installation and usage (starting/running) of this
//software may allow other people or applications to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follow this copyright notice here, but changes must not be
//done inside the copyright notice above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.data;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import de.anomic.http.httpc;
import de.anomic.http.httpdByteCountInputStream;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
/**
* Class to parse a sitemap file.<br>
* An example sitemap file is depicted below:<br>
* <pre>
* &lt;?xml version="1.0" encoding="UTF-8"?&gt;
* &lt;urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"&gt;
* &lt;url&gt;
* &lt;loc&gt;http://www.example.com/&lt;/loc&gt;
* &lt;lastmod&gt;2005-01-01&lt;/lastmod&gt;
* &lt;changefreq&gt;monthly&lt;/changefreq&gt;
* &lt;priority&gt;0.8&lt;/priority&gt;
* &lt;/url&gt;
* &lt;/urlset&gt;
* </pre>
*
* A real example can be found here: http://www.xt-service.de/sitemap.xml
* An example robots.txt containing a sitemap URL: http://notepad.emaillink.de/robots.txt
*
* @see http://www.sitemaps.org/protocol.php
* @see https://www.google.com/webmasters/tools/docs/en/protocol.html
*/
public class SitemapParser extends DefaultHandler {
public static final String XMLNS_SITEMAPS_ORG = "http://www.sitemaps.org/schemas/sitemap/0.9";
public static final String XMLNS_SITEMAPS_GOOGLE = "http://www.google.com/schemas/sitemap/0.84";
public static final String SITEMAP_XMLNS = "xmlns";
public static final String SITEMAP_URLSET = "urlset";
public static final String SITEMAP_URL = "url";
public static final String SITEMAP_URL_LOC = "loc";
public static final String SITEMAP_URL_LASTMOD = "lastmod";
public static final String SITEMAP_URL_CHANGEFREQ = "changefreq";
public static final String SITEMAP_URL_PRIORITY = "priority";
// TODO: which locale settings should we use here?
private final SimpleDateFormat dateFormater = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
/**
* The crawling profile used to parse the URLs contained in the sitemap file
*/
private plasmaCrawlProfile.entry crawlingProfile = null;
/**
* Reference to the plasmaswitchboard.
*/
private plasmaSwitchboard switchboard = null;
/**
* Name of the current XML element
*/
private String currentElement = null;
/**
* A special stream to count how many bytes were processed so far
*/
private httpdByteCountInputStream counterStream;
/**
* The total length of the sitemap file
*/
private long contentLength;
/**
* The number of URLs processed so far
*/
private int urlCounter = 0;
private serverLog logger = new serverLog("SITEMAP");
private URL siteMapURL = null;
private String nextURL = null;
private Date lastMod = null;
public SitemapParser(plasmaSwitchboard sb, URL sitemap, plasmaCrawlProfile.entry theCrawlingProfile) {
if (sb == null) throw new NullPointerException("The switchboard must not be null");
this.switchboard = sb;
this.siteMapURL = sitemap;
if (theCrawlingProfile == null) {
this.crawlingProfile = createProfile(this.siteMapURL.getHost(),this.siteMapURL.toString());
} else {
this.crawlingProfile = theCrawlingProfile;
}
}
public void parse() {
// download document
httpc remote = null;
try {
remote = httpc.getInstance(
this.siteMapURL.getHost(),
this.siteMapURL.getHost(),
this.siteMapURL.getPort(),
5000,
this.siteMapURL.getProtocol().equalsIgnoreCase("https"));
httpc.response res = remote.GET(this.siteMapURL.getFile(), null);
if (res.statusCode != 200) {
throw new Exception("Unable to download the sitemap file. Server returned status: " + res.status);
}
// getting some metadata
String contentMimeType = res.responseHeader.mime();
this.contentLength = res.responseHeader.contentLength();
InputStream contentStream = res.getContentInputStream();
if ((contentMimeType != null) && (
contentMimeType.equals("application/x-gzip") ||
contentMimeType.equals("application/gzip")
)) {
contentStream = new GZIPInputStream(contentStream);
}
this.counterStream = new httpdByteCountInputStream(contentStream,null);
// parse it
SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
saxParser.parse(this.counterStream, this);
} catch (Exception e) {
e.printStackTrace();
} finally {
if (remote != null) try { httpc.returnInstance(remote); } catch (Exception e) {}
}
}
public long getTotalLength() {
return this.contentLength;
}
public long getProcessedLength() {
return (this.counterStream==null)?0:this.counterStream.getCount();
}
public long getUrlcount() {
return this.urlCounter;
}
public void startDocument() throws SAXException {
// TODO: create a new crawling profile
}
/**
* @param localName local name
* @param qName qualified name
* @see DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
*/
public void startElement(
String namespaceURI,
String localName,
String qName,
Attributes attrs) throws SAXException {
this.currentElement = qName;
// testing if the namespace is known
if (qName.equalsIgnoreCase(SITEMAP_URLSET)) {
String namespace = attrs.getValue(SITEMAP_XMLNS);
if ((namespace == null) ||
((!namespace.equals(XMLNS_SITEMAPS_ORG)) &&
(!namespace.equals(XMLNS_SITEMAPS_GOOGLE)))
) throw new SAXException("Unknown sitemap namespace: " + namespace);
}
}
/**
* @param localName local name
* @param qName qualified name
* @see DefaultHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
*/
public void endElement( String namespaceURI,
String localName,
String qName ) {
this.currentElement = "";
if (qName.equalsIgnoreCase(SITEMAP_URL)) {
if (this.nextURL == null) return;
// get the url hash
String nexturlhash = plasmaURL.urlHash(this.nextURL);
// check if the url is known and needs to be recrawled
if (this.lastMod != null) {
String dbocc = this.switchboard.urlExists(nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
// the url was already loaded. we need to check the date
indexURLEntry oldEntry = this.switchboard.wordIndex.loadedURL.load(nexturlhash, null);
if (oldEntry != null) {
Date modDate = oldEntry.moddate();
// the stored copy is newer than the lastmod date from the sitemap: no recrawl needed
if (modDate.after(this.lastMod)) return;
}
}
}
// this URL needs to be crawled
String error = null;
try {
error = this.switchboard.sbStackCrawlThread.stackCrawl(
this.nextURL,
this.siteMapURL.toString(),
yacyCore.seedDB.mySeed.hash,
this.nextURL,
new Date(),
0,
this.crawlingProfile
);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
if (error != null) {
try {
plasmaCrawlZURL.Entry ee = this.switchboard.errorURL.newEntry(new URL(this.nextURL), error);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
this.urlCounter++;
}
}
}
public void characters(char[] buf, int offset, int len) throws SAXException {
if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LOC)) {
// TODO: we need to decode the URL here
this.nextURL =(new String(buf,offset,len)).trim();
if (!this.nextURL.startsWith("http")) {
System.out.println(this.nextURL);
}
}
// else if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LASTMOD)) {
// try {
// this.lastMod = this.dateFormater.parse(new String(buf,offset,len));
// } catch (ParseException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
// }
}
private plasmaCrawlProfile.entry createProfile(String domainName, String sitemapURL) {
return this.switchboard.profiles.newEntry(
domainName,
sitemapURL,
// crawlingFilter
".*", ".*",
// Depth
0, 0,
// force recrawling
0,
// disable Auto-Dom-Filter
-1, -1,
// allow crawling of dynamic URLs
true,
// index text + media
true, true,
// don't store downloaded pages to Web Cache
false,
// store to TX cache
true,
// remote Indexing disabled
false,
// exclude stop-words
true, true, true
);
}
}
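
A minimal usage sketch for the parser above, assuming a plasmaSwitchboard instance is at hand; passing null as the profile makes the parser create a default one via createProfile:

    try {
        URL sitemap = new URL("http://www.xt-service.de/sitemap.xml"); // example URL from the javadoc
        SitemapParser parser = new SitemapParser(switchboard, sitemap, null);
        parser.parse(); // downloads the file (un-gzipping if needed) and stacks the contained URLs
        System.out.println(parser.getUrlcount() + " URLs stacked, " +
                parser.getProcessedLength() + "/" + parser.getTotalLength() + " bytes read");
    } catch (MalformedURLException e) {
        e.printStackTrace();
    }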

@ -137,7 +137,7 @@ public final class robotsParser{
// NEW: just ignore it
} else if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line
} else if (line.startsWith(ROBOTS_SITEMAP)) {
} else if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
pos = line.indexOf(" ");
if (pos != -1) {
sitemap = line.substring(pos).trim();
@ -214,22 +214,55 @@ public final class robotsParser{
return new Object[]{denyList,sitemap};
}
public static boolean isDisallowed(URL nexturl) {
if (nexturl == null) throw new IllegalArgumentException();
// generating the hostname:port string needed to do a DB lookup
String urlHostPort = null;
int port = nexturl.getPort();
private static final int getPort(URL theURL) {
int port = theURL.getPort();
if (port == -1) {
if (nexturl.getProtocol().equalsIgnoreCase("http")) {
if (theURL.getProtocol().equalsIgnoreCase("http")) {
port = 80;
} else if (nexturl.getProtocol().equalsIgnoreCase("https")) {
} else if (theURL.getProtocol().equalsIgnoreCase("https")) {
port = 443;
}
}
urlHostPort = nexturl.getHost() + ":" + port;
urlHostPort = urlHostPort.toLowerCase().intern();
return port;
}
private static final String getHostPort(URL theURL) {
String urlHostPort = null;
int port = getPort(theURL);
urlHostPort = theURL.getHost() + ":" + port;
urlHostPort = urlHostPort.toLowerCase().intern();
return urlHostPort;
}
public static URL getSitemapURL(URL theURL) {
if (theURL == null) throw new IllegalArgumentException();
URL sitemapURL = null;
// generating the hostname:port string needed to do a DB lookup
String urlHostPort = getHostPort(theURL);
plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
synchronized(urlHostPort) {
// doing a DB lookup to determine if the robots data is already available
robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
}
if (robotsTxt4Host == null) return null;
try {
String sitemapUrlStr = robotsTxt4Host.getSitemap();
if (sitemapUrlStr != null) sitemapURL = new URL(sitemapUrlStr);
} catch (MalformedURLException e) {/* ignore this */}
return sitemapURL;
}
public static boolean isDisallowed(URL nexturl) {
if (nexturl == null) throw new IllegalArgumentException();
// generating the hostname:port string needed to do a DB lookup
String urlHostPort = getHostPort(nexturl);
plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
synchronized(urlHostPort) {
@ -245,7 +278,7 @@ public final class robotsParser{
URL robotsURL = null;
// generating the proper url to download the robots txt
try {
robotsURL = new URL(nexturl.getProtocol(),nexturl.getHost(),port,"/robots.txt");
robotsURL = new URL(nexturl.getProtocol(),nexturl.getHost(),getPort(nexturl),"/robots.txt");
} catch (MalformedURLException e) {
serverLog.logSevere("ROBOTS","Unable to generate robots.txt URL for URL '" + nexturl.toString() + "'.");
return false;
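
A short usage sketch of the refactored helpers (the URL is hypothetical; getSitemapURL only succeeds once the robots.txt of the host has been fetched and cached, which isDisallowed triggers as a side effect):

    try {
        URL pageURL = new URL("http://www.example.com/index.html"); // hypothetical URL
        if (!robotsParser.isDisallowed(pageURL)) {
            URL sitemapURL = robotsParser.getSitemapURL(pageURL);
            // non-null only if the cached robots.txt contains a Sitemap: line
            if (sitemapURL != null) System.out.println("sitemap found: " + sitemapURL.toString());
        }
    } catch (MalformedURLException e) {
        e.printStackTrace();
    }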

@ -1,19 +1,18 @@
package de.anomic.plasma.dbImport;
import java.io.File;
import java.util.HashMap;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.logging.serverLog;
public abstract class AbstractImporter extends Thread implements dbImporter{
protected int jobID;
protected int jobID = -1;
protected String jobType;
protected serverLog log;
protected boolean stopped = false;
protected boolean paused = false;
protected File importPrimaryPath, importSecondaryPath;
protected int cacheSize;
protected long preloadTime;
@ -22,29 +21,26 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
protected long globalPauseLast;
protected long globalPauseDuration;
protected String error;
protected plasmaWordIndex wi;
public AbstractImporter(plasmaWordIndex wi) {
//super(theSb.dbImportManager.runningJobs,"");
this.wi = wi;
protected plasmaSwitchboard sb;
AbstractImporter(String theJobType, plasmaSwitchboard switchboard) {
super(switchboard.dbImportManager.runningJobs,"");
this.jobType = theJobType;
this.sb = switchboard;
}
public String getError() {
return this.error;
}
public void init(File thePrimaryPath, File theSecondaryPath) {
if (thePrimaryPath == null) throw new NullPointerException("The Primary Import path must not be null.");
if (theSecondaryPath == null) throw new NullPointerException("The Secondary Import path must not be null.");
this.importPrimaryPath = thePrimaryPath;
this.importSecondaryPath = theSecondaryPath;
// getting a job id from the import manager
//this.jobID = this.sb.dbImportManager.getJobID();
/**
* @see dbImporter#init(HashMap)
*/
public void init(HashMap initparams) throws ImporterException {
// initializing the logger and setting a more verbose thread name
this.log = new serverLog("IMPORT_" + this.jobType + "_" + this.jobID);
this.setName("IMPORT_" + this.jobType /*+ "_" + this.sb.dbImportManager.getJobID()*/);
this.setName("IMPORT_" + this.jobType + "_" + this.jobID);
}
public void startIt() {
@ -101,6 +97,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
return this.jobID;
}
public void setJobID(int id) {
if (this.jobID != -1) throw new IllegalStateException("job ID already assigned");
this.jobID = id;
}
public long getTotalRuntime() {
return (this.globalEnd == 0)?System.currentTimeMillis()-(this.globalStart+this.globalPauseDuration):this.globalEnd-(this.globalStart+this.globalPauseDuration);
}
@ -117,13 +118,6 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
return this.jobType;
}
public File getPrimaryImportPath() {
return this.importPrimaryPath;
}
public File getSecondaryImportPath() {
return this.importSecondaryPath;
}
public abstract long getEstimatedTime();
public abstract String getJobName();
public abstract int getProcessingStatusPercent();

@ -0,0 +1,11 @@
package de.anomic.plasma.dbImport;
public class ImporterException extends Exception {
public ImporterException(String message) {
super(message);
}
public ImporterException(String message, Throwable cause) {
super(message, cause);
}
}

@ -0,0 +1,133 @@
//SitemapImporter.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2007
//
//this file was contributed by Martin Thelian
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this software or this documentation. The usage of this software
//is at your own risk. The installation and usage (starting/running) of this
//software may allow other people or applications to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follow this copyright notice here, but changes must not be
//done inside the copyright notice above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.plasma.dbImport;
import java.util.HashMap;
import de.anomic.data.SitemapParser;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
public class SitemapImporter extends AbstractImporter implements dbImporter {
private SitemapParser parser = null;
private URL sitemapURL = null;
public SitemapImporter(plasmaSwitchboard switchboard) {
super("sitemap",switchboard);
}
public long getEstimatedTime() {
long t = getElapsedTime();
int p = getProcessingStatusPercent();
return (p==0)?0:(t/p)*(100-p);
}
/**
* @see dbImporter#getJobName()
*/
public String getJobName() {
return this.sitemapURL.toString();
}
/**
* @see dbImporter#getProcessingStatusPercent()
*/
public int getProcessingStatusPercent() {
if (this.parser == null) return 0;
long total = this.parser.getTotalLength();
long processed = this.parser.getProcessedLength();
if (total <= 1) return 0;
return (int) ((processed*100)/ total);
}
/**
* @see dbImporter#getStatus()
*/
public String getStatus() {
StringBuffer theStatus = new StringBuffer();
theStatus.append("#URLs=").append((this.parser==null)?0:this.parser.getUrlcount());
return theStatus.toString();
}
/**
* @see dbImporter#init(HashMap)
* @see AbstractImporter#init(HashMap)
*/
public void init(HashMap initParams) throws ImporterException {
super.init(initParams);
if (initParams == null || initParams.size() == 0) throw new IllegalArgumentException("Init parameters are missing");
if (!initParams.containsKey("crawlingProfile")) throw new IllegalArgumentException("Init parameters 'crawlingProfile' is missing");
if (!initParams.containsKey("sitemapURL")) throw new IllegalArgumentException("Init parameters 'sitemapURL' is missing");
try {
// getting the sitemap URL
this.sitemapURL = new URL((String)initParams.get("sitemapURL"));
// getting the crawling profile to use
plasmaCrawlProfile.entry profileEntry = this.sb.profiles.getEntry((String)initParams.get("crawlingProfile"));
// creating the sitemap parser
this.parser = new SitemapParser(this.sb,this.sitemapURL,profileEntry);
} catch (Exception e) {
throw new ImporterException("Unable to initialize Importer",e);
}
}
public void run() {
try {
this.parser.parse();
} finally {
this.globalEnd = System.currentTimeMillis();
this.sb.dbImportManager.finishedJobs.add(this);
}
}
}
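
The estimate above is plain linear extrapolation from elapsed time and progress, e.g. (hypothetical numbers):

    // elapsed t = 20000 ms at p = 25 percent processed  ->  (20000/25)*(100-25) = 60000 ms remaining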

@ -16,18 +16,18 @@ public class dbImportManager {
this.sb = theSb;
}
public int getJobID() {
private int generateUniqueJobID() {
int jobID;
synchronized(runningJobs) {
jobID = currMaxJobNr;
currMaxJobNr++;
synchronized(this.runningJobs) {
jobID = this.currMaxJobNr;
this.currMaxJobNr++;
}
return jobID;
}
public dbImporter[] getRunningImporter() {
Thread[] importThreads = new Thread[runningJobs.activeCount()*2];
int activeCount = runningJobs.enumerate(importThreads);
Thread[] importThreads = new Thread[this.runningJobs.activeCount()*2];
int activeCount = this.runningJobs.enumerate(importThreads);
dbImporter[] importers = new dbImporter[activeCount];
for (int i=0; i<activeCount; i++) {
importers[i] = (dbImporter) importThreads[i];
@ -36,7 +36,7 @@ public class dbImportManager {
}
public dbImporter[] getFinishedImporter() {
return (dbImporter[]) finishedJobs.toArray(new dbImporter[finishedJobs.size()]);
return (dbImporter[]) this.finishedJobs.toArray(new dbImporter[this.finishedJobs.size()]);
}
public dbImporter getImporterByID(int jobID) {
@ -57,10 +57,18 @@ public class dbImportManager {
if (type == null) return null;
if (type.length() == 0) return null;
// create a new importer thread
dbImporter newImporter = null;
if (type.equalsIgnoreCase("NURL")) {
newImporter = new plasmaCrawlNURLImporter(this.sb);
} else if (type.equalsIgnoreCase("sitemap")) {
newImporter = new SitemapImporter(this.sb);
}
// unknown importer type
if (newImporter == null) return null;
// assign a job ID to it
newImporter.setJobID(this.generateUniqueJobID());
// return the newly created importer
return newImporter;
}
@ -86,7 +94,7 @@ public class dbImportManager {
for ( int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++ ) {
Thread currentThread = threadList[currentThreadIdx];
if (currentThread.isAlive()) {
((plasmaDbImporter)currentThread).stopIt();
((dbImporter)currentThread).stopIt();
}
}
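
Given the manager methods above, a monitor page can enumerate jobs roughly like this (a sketch; the switchboard instance is an assumption):

    dbImporter[] running = switchboard.dbImportManager.getRunningImporter();
    for (int i = 0; i < running.length; i++) {
        System.out.println("job #" + running[i].getJobID() + " (" + running[i].getJobType() + "): "
                + running[i].getProcessingStatusPercent() + "% - " + running[i].getStatus());
    }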

@ -1,6 +1,7 @@
package de.anomic.plasma.dbImport;
import java.io.File;
import java.util.HashMap;
public interface dbImporter {
@ -18,13 +19,12 @@ public interface dbImporter {
public int getProcessingStatusPercent();
public int getJobID();
public void setJobID(int id);
public String getJobName();
public String getJobType();
public File getPrimaryImportPath();
public File getSecondaryImportPath();
public String getError();
public String getStatus();
public void init(File plasmaPath, File indexPrimaryPath, File indexSecondaryPath, int cacheSize, long preloadTime);
public String getStatus();
//public void init(File plasmaPath, File indexPrimaryPath, File indexSecondaryPath, int cacheSize, long preloadTime);
public void init(HashMap initParams) throws ImporterException;
public void startIt();
}

@ -2,6 +2,7 @@ package de.anomic.plasma.dbImport;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeMap;
@ -10,20 +11,22 @@ import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImporter {
private File plasmaPath = null;
private HashSet importProfileHandleCache = new HashSet();
private plasmaCrawlProfile importProfileDB;
private plasmaCrawlNURL importNurlDB;
private plasmaWordIndex wi;
private int importStartSize;
private int urlCount = 0;
private int profileCount = 0;
private plasmaSwitchboard sb;
public plasmaCrawlNURLImporter(plasmaSwitchboard theSb) {
super(theSb.wordIndex);
this.jobType="NURL";
super("NURL",theSb);
this.wi = this.sb.wordIndex;
}
public long getEstimatedTime() {
@ -31,7 +34,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
}
public String getJobName() {
return this.importPrimaryPath.toString();
return this.plasmaPath.toString();
}
public int getProcessingStatusPercent() {
@ -47,10 +50,21 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
return theStatus.toString();
}
public void init(File plasmaPath, File indexPrimary, File indexSecondary, int theCacheSize, long preloadTime) {
super.init(indexPrimary, indexSecondary);
this.cacheSize = theCacheSize;
this.preloadTime = preloadTime;
public void init(HashMap initParams) throws ImporterException {
super.init(initParams);
if (initParams == null || initParams.size() == 0) throw new IllegalArgumentException("Init parameters are missing");
if (!initParams.containsKey("plasmaPath")) throw new IllegalArgumentException("Init parameters 'plasmaPath' is missing");
if (!initParams.containsKey("cacheSize")) throw new IllegalArgumentException("Init parameters 'cacheSize' is missing");
if (!initParams.containsKey("preloadTime")) throw new IllegalArgumentException("Init parameters 'preloadTime' is missing");
// TODO: we need more errorhandling here
this.plasmaPath = new File((String)initParams.get("plasmaPath"));
this.cacheSize = Integer.valueOf((String)initParams.get("cacheSize")).intValue();
if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024;
this.preloadTime = Long.valueOf((String)initParams.get("preloadTime")).longValue();
File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db");
File profileDbFile = new File(plasmaPath, "crawlProfiles0.db");

@ -1,6 +1,7 @@
package de.anomic.plasma.dbImport;
import java.io.File;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;
@ -9,12 +10,23 @@ import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDate;
public class plasmaDbImporter extends AbstractImporter implements dbImporter {
private File importPrimaryPath, importSecondaryPath;
/**
* the source word index (the DB to import)
*/
private plasmaWordIndex importWordIndex;
/**
* the destination word index (the home DB)
*/
protected plasmaWordIndex homeWordIndex;
private int importStartSize;
private String wordHash = "------------";
@ -24,16 +36,22 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
private long urlCounter = 0, wordCounter = 0, entryCounter = 0, notBoundEntryCounter = 0;
public plasmaDbImporter(plasmaWordIndex homeWI, plasmaWordIndex importWI) {
super(homeWI);
public plasmaDbImporter(plasmaSwitchboard sb, plasmaWordIndex homeWI, plasmaWordIndex importWI) {
super("PLASMADB",sb);
this.homeWordIndex = homeWI;
this.importWordIndex = importWI;
this.jobType = "PLASMADB";
}
/**
* @see dbImporter#getJobName()
*/
public String getJobName() {
return this.importPrimaryPath.toString();
}
/**
* @see dbImporter#getStatus()
*/
public String getStatus() {
StringBuffer theStatus = new StringBuffer();
@ -46,12 +64,29 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
return theStatus.toString();
}
public void init(File plasmaPath, File thePrimaryPath, File theSecondaryPath, int theCacheSize, long preloadTime) {
super.init(thePrimaryPath, theSecondaryPath);
//public void init(File thePrimaryPath, File theSecondaryPath, int theCacheSize, long preloadTime) {
/**
* @throws ImporterException
* @see dbImporter#init(HashMap)
*/
public void init(HashMap initParams) throws ImporterException {
super.init(initParams);
if (initParams == null || initParams.size() == 0) throw new IllegalArgumentException("Init parameters are missing");
if (!initParams.containsKey("primaryPath")) throw new IllegalArgumentException("Init parameters 'primaryPath' is missing");
if (!initParams.containsKey("secondaryPath")) throw new IllegalArgumentException("Init parameters 'secondaryPath' is missing");
if (!initParams.containsKey("cacheSize")) throw new IllegalArgumentException("Init parameters 'cacheSize' is missing");
if (!initParams.containsKey("preloadTime")) throw new IllegalArgumentException("Init parameters 'preloadTime' is missing");
// TODO: we need more errorhandling here
this.importPrimaryPath = new File((String)initParams.get("primaryPath"));
this.importSecondaryPath = new File((String)initParams.get("secondaryPath"));
this.cacheSize = theCacheSize;
this.cacheSize = Integer.valueOf((String)initParams.get("cacheSize")).intValue();
if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024;
this.preloadTime = Long.valueOf((String)initParams.get("preloadTime")).longValue();
// configure import DB
String errorMsg = null;
if (!this.importPrimaryPath.exists()) errorMsg = "Primary Import directory does not exist.";
@ -72,7 +107,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
this.log.logFine("Initializing source word index db.");
this.importWordIndex = new plasmaWordIndex(this.importPrimaryPath, importSecondaryPath, preloadTime / 2, this.log);
this.importWordIndex = new plasmaWordIndex(this.importPrimaryPath, this.importSecondaryPath, preloadTime / 2, this.log);
this.importStartSize = this.importWordIndex.size();
}
@ -86,6 +121,9 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
}
/**
* @see dbImporter#getProcessingStatusPercent()
*/
public int getProcessingStatusPercent() {
// this seems to be better:
// (this.importStartSize-this.importWordIndex.size())*100/((this.importStartSize==0)?1:this.importStartSize);
@ -94,6 +132,9 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
return (int)(this.wordCounter)/((this.importStartSize<100)?1:(this.importStartSize)/100);
}
/**
* @see dbImporter#getEstimatedTime()
*/
public long getEstimatedTime() {
return (this.wordCounter==0)?0:((this.importStartSize*getElapsedTime())/this.wordCounter)-getElapsedTime();
}
@ -103,7 +144,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
try {
this.log.logInfo("Importing DB from '" + this.importPrimaryPath.getAbsolutePath() + "'/'" + this.importSecondaryPath.getAbsolutePath() + "'");
this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs.");
this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeWordIndex.loadedURL.size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs.");
HashSet unknownUrlBuffer = new HashSet();
@ -156,7 +197,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
if (urlEntry != null) {
/* write it into the home url db */
wi.loadedURL.store(urlEntry);
homeWordIndex.loadedURL.store(urlEntry);
importedUrlBuffer.add(urlHash);
this.urlCounter++;
@ -178,7 +219,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
if (isAborted()) break;
// importing entity container to home db
if (newContainer.size() > 0) { wi.addEntries(newContainer, System.currentTimeMillis(), false); }
if (newContainer.size() > 0) { homeWordIndex.addEntries(newContainer, System.currentTimeMillis(), false); }
// delete complete index entity file
this.importWordIndex.deleteContainer(this.wordHash);
@ -198,7 +239,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
"Speed: "+ 500*1000/duration + " word entities/s" +
" | Elapsed time: " + serverDate.intervalToString(getElapsedTime()) +
" | Estimated time: " + serverDate.intervalToString(getEstimatedTime()) + "\n" +
"Home Words = " + wi.size() +
"Home Words = " + homeWordIndex.size() +
" | Import Words = " + this.importWordIndex.size());
this.wordChunkStart = this.wordChunkEnd;
this.wordChunkStartHash = this.wordChunkEndHash;
@ -221,7 +262,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
}
this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs.");
this.log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeWordIndex.loadedURL.size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs.");
} catch (Exception e) {
this.log.logSevere("Database import failed.",e);
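
Under the new contract a PLASMADB import is parameterized with the keys checked above; a sketch with hypothetical paths (all values are passed as strings, and cache sizes below 2 MB fall back to 8 MB):

    plasmaDbImporter importer = new plasmaDbImporter(sb, homeWordIndex, importWordIndex); // instances assumed
    HashMap initParams = new HashMap();
    initParams.put("primaryPath", "/opt/yacy/INDEX.import/PRIMARY");     // hypothetical path
    initParams.put("secondaryPath", "/opt/yacy/INDEX.import/SECONDARY"); // hypothetical path
    initParams.put("cacheSize", "8388608");
    initParams.put("preloadTime", "100");
    try {
        importer.init(initParams); // throws ImporterException if a key is missing or a path is invalid
        importer.startIt();
    } catch (ImporterException e) {
        e.printStackTrace();
    }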

@ -200,7 +200,7 @@ public class plasmaCrawlRobotsTxt {
}
public String getSitemap() {
return this.mem.containsKey(SITEMAP)? (String)this.mem.get(LOADED_DATE): null;
return this.mem.containsKey(SITEMAP)? (String)this.mem.get(SITEMAP): null;
}
public Date getLoadedDate() {
