*) adding experimental support for parsing of bookmark files

See: http://www.yacy-forum.de/viewtopic.php?t=177

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@388 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent f57b60cd60
commit 5c3822d5f4

@ -102,14 +102,27 @@ You can define URLs as start points for Web page crawling and start that crawlin
</tr>
-->
<tr valign="top" class="TableCellLight">
<td class=small>Start Point:</td>
<td class=small colspan="2"><input name="crawlingURL" type="text" size="42" maxlength="256" value="http://"></td>
<td class=small><input type="submit" name="crawlingstart" value="Start New Crawl"></td>
<td class=small>Existing start URL's are re-crawled.
<td class="small" rowspan="3">Starting Point:</td>
<td class="small">
<table cellpadding="0" cellspacing="0">
<tr><td class="small">From&nbsp;File:</td>
<td class="small"><input type="radio" name="crawlingMode" value="file"></td>
<td class="small"><input type="file" name="crawlingFile" size="28"></td>
</tr>
<tr><td class="small">From&nbsp;URL:</td>
<td class="small"><input type="radio" name="crawlingMode" value="url" checked="checked"></td>
<td class="small"><input name="crawlingURL" type="text" size="41" maxlength="256" value="http://"></td>
</tr>
</table>
</td>
<td class=small colspan="3" rowspan="2">Existing start URL's are re-crawled.
Other already visited URL's are sorted out as 'double'.
A complete re-crawl will be available soon.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td class=small colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl"></td>
</tr>
</form>
</table>
</p>
@ -130,19 +143,21 @@ Your peer can search and index for other peers and they can search for you.</div
<p>
#(error)#
::
#(error)#<!-- 0 -->
::<!-- 1 -->
Error with profile management. Please stop yacy, delete the File DATA/PLASMADB/crawlProfiles0.db and restart.
::
::<!-- 2 -->
Error: #[errmsg]#
::
::<!-- 3 -->
Application not yet initialized. Sorry. Please wait some seconds and repeat the request.
::
::<!-- 4 -->
<b>ERROR: Crawl filter "#[newcrawlingfilter]#" does not match with crawl root "#[crawlingStart]#".</b> Please try again with different filter</p><br>
::
::<!-- 5 -->
Crawling of "#[crawlingURL]#" failed. Reason: #[reasonString]#<br>
::
::<!-- 6 -->
Error with url input "#[crawlingStart]#": #[error]#
::<!-- 7 -->
Error with file input "#[crawlingStart]#": #[error]#
#(/error)#
<br>
#(info)#

@ -43,20 +43,30 @@
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT
import java.io.File;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
@ -104,6 +114,8 @@ public class IndexCreate_p {
boolean xpstopw = ((String) post.get("xpstopw", "")).equals("on");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
String crawlingMode = post.get("crawlingMode","url");
if (crawlingMode.equals("url")) {
String crawlingStart = (String) post.get("crawlingURL");
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;
@ -148,6 +160,67 @@ public class IndexCreate_p {
prop.put("error_error", e.getMessage());
e.printStackTrace();
}
} else if (crawlingMode.equals("file")) {
if (post.containsKey("crawlingFile")) {
// getting the name of the uploaded file
String fileName = (String) post.get("crawlingFile");
try {
File file = new File(fileName);
// getting the content of the bookmark file
byte[] fileContent = (byte[]) post.get("crawlingFile$file");
// parsing the bookmark file and fetching the headline and contained links
htmlFilterContentScraper scraper = new htmlFilterContentScraper(file.toURL());
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.write(fileContent,os);
os.close();
String headline = scraper.getHeadline();
HashMap hyperlinks = (HashMap) scraper.getAnchors();
// creating a crawler profile
plasmaCrawlProfile.entry profile = switchboard.profiles.newEntry(fileName, file.toURL().toString(), newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingQ, storeHTCache, true, localIndexing, crawlOrder, xsstopw, xdstopw, xpstopw);
// loop through the contained links
Iterator interator = hyperlinks.entrySet().iterator();
int c = 0;
while (interator.hasNext()) {
Map.Entry e = (Map.Entry) interator.next();
String nexturlstring = (String) e.getKey();
// generating an url object
URL nexturlURL = null;
try {
nexturlURL = new URL(nexturlstring);
} catch (MalformedURLException ex) {
nexturlURL = null;
c++;
continue;
}
// enqueuing the url for crawling
String rejectReason = switchboard.stackCrawl(nexturlstring, null, yacyCore.seedDB.mySeed.hash, (String)e.getValue(), new Date(), 1, profile);
// if something failed add the url into the errorURL list
if (rejectReason == null) {
c++;
} else {
switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
}
}
} catch (Exception e) {
// reading or parsing the uploaded file failed: report it as a file-input error (template case 7)
prop.put("error", 7);//Error with file
prop.put("error_crawlingStart", fileName);
prop.put("error_error", e.getMessage());
e.printStackTrace();
}
}
}
}
}

Loading…
Cancel
Save