added option to distinguish between text crawl and media crawl

- for each crawl start, there are now separate flags for text indexing and media indexing (see the sketch below)
- the old localIndexing flag is superfluous and has been removed
- added new default crawl profiles: snippetText and snippetMedia replace the single snippet profile
- if an image search is done, only the media links are crawled for the snippets
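
A minimal sketch of how the two new flags replace localIndexing, condensed from the plasmaCrawlProfile and plasmaSwitchboardQueue hunks below (names as in the diff; surrounding class code omitted):

    // per-profile flags, read from the profile's string map "mem"
    public boolean indexText() {
        String r = (String) mem.get("indexText");
        return (r != null) && r.equals("true");
    }
    public boolean indexMedia() {
        String r = (String) mem.get("indexMedia");
        return (r != null) && r.equals("true");
    }

    // indexing is allowed if at least one flag is set; this replaces the
    // old profile().localIndexing() check in plasmaSwitchboardQueue:
    if ((!profile().indexText()) && (!profile().indexMedia())) {
        return "Indexing_Not_Allowed";
    }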


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3100 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 6866bcd0e0
commit 61798f0ae6

@ -151,7 +151,7 @@ public class Bookmarks {
plasmaParserDocument document = null;
if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp();
document = switchboard.snippetCache.retrieveDocument(comp.url(), true, 5000);
document = switchboard.snippetCache.retrieveDocument(comp.url(), true, 5000, true);
prop.put("mode_edit", 0); // create mode
prop.put("mode_url", comp.url().toNormalform());
prop.put("mode_title", comp.descr());

@ -106,7 +106,8 @@
</tr>
<tr valign="top" class="TableCellDark">
<td>Do Local Indexing:</td>
<td><input type="checkbox" name="localIndexing" #(localIndexingChecked)#::checked="checked"#(/localIndexingChecked)# /></td>
<td>index text:<input type="checkbox" name="indexText" #(indexingTextChecked)#::checked="checked"#(/indexingTextChecked)# />&nbsp;&nbsp;&nbsp;
index media:<input type="checkbox" name="indexMedia" #(indexingMediaChecked)#::checked="checked"#(/indexingMediaChecked)# /></td>
<td>
This enables indexing of the web pages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the
<a href="CacheAdmin_p.html">Proxy Cache</a> without indexing.

@ -141,7 +141,8 @@ public class IndexCreate_p {
prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
prop.put("crawlingQChecked", env.getConfig("crawlingQ", "").equals("true") ? 1 : 0);
prop.put("storeHTCacheChecked", env.getConfig("storeHTCache", "").equals("true") ? 1 : 0);
prop.put("localIndexingChecked", env.getConfig("localIndexing", "").equals("true") ? 1 : 0);
prop.put("indexingTextChecked", env.getConfig("indexText", "").equals("true") ? 1 : 0);
prop.put("indexingMediaChecked", env.getConfig("indexMedia", "").equals("true") ? 1 : 0);
prop.put("crawlOrderChecked", env.getConfig("crawlOrder", "").equals("true") ? 1 : 0);
long busySleep = Integer.parseInt(env.getConfig("62_remotetriggeredcrawl_busysleep", "100"));
if (busySleep < 100) {

@ -15,7 +15,7 @@
If you click on it while browsing, the currently viewed website will be inserted into the YaCy crawling queue for indexing.
</p>
<p>
<a class="BookmarkLink" href="javascript:w = window.open('http://#[host]#:#[port]#/QuickCrawlLink_p.html?localIndexing=on&amp;crawlingQ=on&amp;xdstopw=on&amp;title='+escape(document.title)+'&amp;url='+escape(location.href),'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();">Crawl with YaCy</a>
<a class="BookmarkLink" href="javascript:w = window.open('http://#[host]#:#[port]#/QuickCrawlLink_p.html?indexText=on&indexMedia=on&amp;crawlingQ=on&amp;xdstopw=on&amp;title='+escape(document.title)+'&amp;url='+escape(location.href),'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();">Crawl with YaCy</a>
</p>
::<!-- 1 -->

@ -66,7 +66,7 @@ public class QuickCrawlLink_p {
/**
* Example Javascript to call this servlet:
* <code>javascript:w = window.open('http://user:pwd@localhost:8080/QuickCrawlLink_p.html?localIndexing=on&crawlingQ=on&xdstopw=on&title=' + escape(document.title) + '&url=' + location.href,'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no'); w.focus();</code>
* <code>javascript:w = window.open('http://user:pwd@localhost:8080/QuickCrawlLink_p.html?indexText=on&indexMedia=on&crawlingQ=on&xdstopw=on&title=' + escape(document.title) + '&url=' + location.href,'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no'); w.focus();</code>
* @param header
* @param post
* @param env
@ -114,8 +114,9 @@ public class QuickCrawlLink_p {
String crawlingFilter = post.get("crawlingFilter", ".*");
int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0"));
boolean crawlDynamic = post.get("crawlingQ", "").equals("on");
boolean indexText = post.get("indexText", "on").equals("on");
boolean indexMedia = post.get("indexMedia", "on").equals("on");
boolean storeHTCache = post.get("storeHTCache", "").equals("on");
boolean localIndexing = post.get("localIndexing", "").equals("on");
boolean remoteIndexing = post.get("crawlOrder", "").equals("on");
boolean xsstopw = post.get("xsstopw", "").equals("on");
boolean xdstopw = post.get("xdstopw", "").equals("on");
@ -166,13 +167,14 @@ public class QuickCrawlLink_p {
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domFilterDepth, if negative: no auto-filter
-1, // domMaxPages, if negative: no count restriction
crawlDynamic,
crawlDynamic,
indexText,
indexMedia,
storeHTCache,
true,
localIndexing,
remoteIndexing,
xsstopw,
xdstopw,
true,
remoteIndexing,
xsstopw,
xdstopw,
xpstopw
);
} catch (Exception e) {

@ -171,7 +171,7 @@ public class ViewFile {
if (resource == null) {
plasmaHTCache.Entry entry = null;
try {
entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false);
entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false, true);
} catch (plasmaCrawlerException e) {
prop.put("error", 4);
prop.put("error_errorText", e.getMessage());

@ -74,7 +74,7 @@ public class ViewImage {
int timeout = post.getInt("timeout", 5000);
// getting the image as stream
Object[] resource = sb.snippetCache.getResource(url, true, timeout);
Object[] resource = sb.snippetCache.getResource(url, true, timeout, false);
if (resource == null) return null;
InputStream imgStream = (InputStream) resource[0];
if (imgStream == null) return null;

@ -118,7 +118,8 @@
<td><strong>Max Page Per Domain</strong></td>
<td><strong>Accept '?' URLs</strong></td>
<td><strong>Fill Proxy Cache</strong></td>
<td><strong>Local Indexing</strong></td>
<td><strong>Local Text Indexing</strong></td>
<td><strong>Local Media Indexing</strong></td>
<td><strong>Remote Indexing</strong></td>
<td></td>
</tr>
@ -134,7 +135,8 @@
<td>#[crawlingDomMaxPages]#</td>
<td>#(withQuery)#no::yes#(/withQuery)#</td>
<td>#(storeCache)#no::yes#(/storeCache)#</td>
<td>#(localIndexing)#no::yes#(/localIndexing)#</td>
<td>#(indexText)#no::yes#(/indexText)#</td>
<td>#(indexMedia)#no::yes#(/indexMedia)#</td>
<td>#(remoteIndexing)#no::yes#(/remoteIndexing)#</td>
<td>#(deleteButton)#::<form action="WatchCrawler_p.html" method="get" enctype="multipart/form-data"><input type="hidden" name="handle" value="#[handle]#" /><input type="submit" name="deleteprofile" value="Delete" /></form>#(/deleteButton)#</td>
</tr>
@ -147,6 +149,7 @@
<tbody>
<tr class="TableHeader">
<th>Queue</th>
<th>Profile</th>
<th>Initiator</th>
<th>Depth</th>
<th>Modified Date</th>

@ -102,10 +102,10 @@ public class WatchCrawler_p {
boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
boolean indexText = post.get("indexText", "on").equals("on");
boolean indexText = post.get("indexText", "off").equals("on");
env.setConfig("indexText", (indexText) ? "true" : "false");
boolean indexMedia = post.get("indexMedia", "on").equals("on");
boolean indexMedia = post.get("indexMedia", "off").equals("on");
env.setConfig("indexMedia", (indexMedia) ? "true" : "false");
boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
@ -181,7 +181,8 @@ public class WatchCrawler_p {
if (crawlOrder) {
Map m = new HashMap(pe.map()); // must be cloned
m.remove("specificDepth");
m.remove("localIndexing");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
@ -328,7 +329,8 @@ public class WatchCrawler_p {
prop.put("crawlProfiles_"+count+"_remoteIndexing", ((profile.remoteIndexing()) ? 1 : 0));
prop.put("crawlProfiles_"+count+"_deleteButton", (((profile.name().equals("remote")) ||
(profile.name().equals("proxy")) ||
(profile.name().equals("snippet"))) ? 0 : 1));
(profile.name().equals("snippetText")) ||
(profile.name().equals("snippetMedia")) ? 0 : 1)));
prop.put("crawlProfiles_"+count+"_deleteButton_handle", profile.handle());
dark = !dark;

@ -145,6 +145,7 @@ function updateTable(indexingqueue, tablename){
dark=false;
for(i=0;i<entries.length;i++){
profile=getValue(getFirstChild(entries[i], "profile"));
initiator=getValue(getFirstChild(entries[i], "initiator"));
depth=getValue(getFirstChild(entries[i], "depth"));
modified=getValue(getFirstChild(entries[i], "modified"));
@ -160,7 +161,7 @@ function updateTable(indexingqueue, tablename){
deletebutton=createLinkCol("IndexCreateIndexingQueue_p.html?deleteEntry="+hash, DELETE_STRING);
else
deletebutton=createCol("");
row=createIndexingRow(tablename, initiator, depth, modified, anchor, url, size, deletebutton);
row=createIndexingRow(tablename, profile, initiator, depth, modified, anchor, url, size, deletebutton);
//create row
if(inProcess){
@ -175,10 +176,11 @@ function updateTable(indexingqueue, tablename){
}
}
function createIndexingRow(queue, initiator, depth, modified, anchor, url, size, deletebutton){
function createIndexingRow(queue, profile, initiator, depth, modified, anchor, url, size, deletebutton){
row=document.createElement("tr");
row.setAttribute("height", 10);
row.appendChild(createCol(queue));
row.appendChild(createCol(profile));
row.appendChild(createCol(initiator));
row.appendChild(createCol(depth));
row.appendChild(createCol(modified));

@ -115,6 +115,7 @@ public class queues_p {
totalSize += entrySize;
if ((pcentry != null)&&(pcentry.url() != null)) {
initiator = yacyCore.seedDB.getConnected(pcentry.initiator());
prop.put("list-indexing_"+i+"_profile", pcentry.profile().name());
prop.putNoHTML("list-indexing_"+i+"_initiator", ((initiator == null) ? "proxy" : wikiCode.replaceHTML(initiator.getName())));
prop.put("list-indexing_"+i+"_depth", pcentry.depth());
prop.put("list-indexing_"+i+"_modified", pcentry.getModificationDate());
@ -144,7 +145,7 @@ public class queues_p {
CrawlWorker theWorker = (CrawlWorker)threadList[i];
plasmaCrawlLoaderMessage theMsg = theWorker.theMsg;
if (theMsg == null) continue;
prop.put("list-loader_"+count+"_profile", theMsg.profile.name());
initiator = yacyCore.seedDB.getConnected(theMsg.initiator);
prop.putNoHTML("list-loader_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("list-loader_"+count+"_depth", theMsg.depth );
@ -183,6 +184,7 @@ public class queues_p {
urle = crawlerList[i];
if ((urle != null) && (urle.url() != null)) {
initiator = yacyCore.seedDB.getConnected(urle.initiator());
prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle());
prop.put(tableName + "_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put(tableName + "_" + showNum + "_depth", urle.depth());
prop.put(tableName + "_" + showNum + "_modified", daydate(urle.loaddate()));

@ -4,6 +4,7 @@
<max>#[indexingMax]#</max>
#{list-indexing}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<modified>#[modified]#</modified>
@ -20,6 +21,7 @@
<max>#[loaderMax]#</max>
#{list-loader}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<url>#[url]#</url>
@ -30,6 +32,7 @@
<size>#[localCrawlSize]#</size>
#{list-local}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<modified>#[modified]#</modified>
@ -44,6 +47,7 @@
<size>#[remoteCrawlSize]#</size>
#{list-remote}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<modified>#[modified]#</modified>

@ -26,6 +26,7 @@ public class snippet {
// getting url
String urlString = post.get("url", "");
URL url = new URL(urlString);
prop.put("urlHash",plasmaURL.urlHash(url));
// if 'remove' is set to true, then RWI references to URLs that do not have the snippet are removed
boolean remove = post.get("remove", "false").equals("true");
@ -33,11 +34,15 @@ public class snippet {
// boolean line_end_with_punctuation
boolean pre = post.get("pre", "false").equals("true");
// type of media
String media = post.get("media", "text");
String querystring = post.get("search", "").trim();
if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
querystring = querystring.substring(1, querystring.length() - 1).trim();
}
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
Set queryHashes = plasmaCondenser.words2hashes(query);
// filter out stopwords
final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords);
@ -46,36 +51,39 @@ public class snippet {
}
// find snippet
Set queryHashes = plasmaCondenser.words2hashes(query);
plasmaSnippetCache.TextSnippet snippet = switchboard.snippetCache.retrieveTextSnippet(url, queryHashes, true, pre, 260, 10000);
prop.put("status",snippet.getSource());
if (snippet.getSource() < 11) {
//prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");
prop.put("text", (snippet.exists()) ? "<![CDATA["+snippet.getLineMarked(queryHashes)+"]]>" : "unknown");
if (media.equals("text")) {
// attach text snippet
plasmaSnippetCache.TextSnippet snippet = switchboard.snippetCache.retrieveTextSnippet(url, queryHashes, true, pre, 260, 10000);
prop.put("status",snippet.getSource());
if (snippet.getSource() < 11) {
//prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");
prop.put("text", (snippet.exists()) ? "<![CDATA["+snippet.getLineMarked(queryHashes)+"]]>" : "unknown");
} else {
String error = snippet.getError();
if ((remove) && (error.equals("no matching snippet found"))) {
serverLog.logInfo("snippet-fetch", "no snippet found, remove words '" + querystring + "' for url = " + url.toNormalform());
switchboard.wordIndex.removeReferences(query, plasmaURL.urlHash(url));
}
prop.put("text", error);
}
prop.put("link", 0);
prop.put("links", 0);
} else {
String error = snippet.getError();
if ((remove) && (error.equals("no matching snippet found"))) {
serverLog.logInfo("snippet-fetch", "no snippet found, remove words '" + querystring + "' for url = " + url.toNormalform());
switchboard.wordIndex.removeReferences(query, plasmaURL.urlHash(url));
// attach media information
ArrayList mediaSnippets = switchboard.snippetCache.retrieveMediaSnippets(url, queryHashes, true, 1000);
plasmaSnippetCache.MediaSnippet ms;
for (int i = 0; i < mediaSnippets.size(); i++) {
ms = (plasmaSnippetCache.MediaSnippet) mediaSnippets.get(i);
prop.put("link_" + i + "_type", ms.type);
prop.put("link_" + i + "_href", ms.href);
prop.put("link_" + i + "_name", ms.name);
prop.put("link_" + i + "_attr", ms.attr);
}
prop.put("text", error);
}
prop.put("urlHash",plasmaURL.urlHash(url));
// attach link information
ArrayList mediaSnippets = switchboard.snippetCache.retrieveMediaSnippets(url, queryHashes, true, 1000);
plasmaSnippetCache.MediaSnippet ms;
for (int i = 0; i < mediaSnippets.size(); i++) {
ms = (plasmaSnippetCache.MediaSnippet) mediaSnippets.get(i);
prop.put("link_" + i + "_type", ms.type);
prop.put("link_" + i + "_href", ms.href);
prop.put("link_" + i + "_name", ms.name);
prop.put("link_" + i + "_attr", ms.attr);
System.out.println("DEBUG: " + mediaSnippets.size() + " ENTRIES IN MEDIA SNIPPET LINKS for url " + urlString);
prop.put("text", "");
prop.put("link", mediaSnippets.size());
prop.put("links", mediaSnippets.size());
}
System.out.println("DEBUG: " + mediaSnippets.size() + " ENTRIES IN MEDIA SNIPPET LINKS for url " + urlString);
prop.put("link", mediaSnippets.size());
prop.put("links", mediaSnippets.size());
// return rewrite properties
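
A hedged client sketch for exercising the changed servlet: the /xml/snippet.xml path and the local peer address are assumptions, while the url, search, and media parameter names come from the post.get(...) calls above. Passing media=text returns a text snippet; any other value takes the new media-links branch.

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.URL;
    import java.net.URLEncoder;

    public class SnippetFetchExample {
        public static void main(String[] args) throws Exception {
            String peer = "http://localhost:8080";           // assumed local peer
            String target = "http://example.org/page.html";  // page to snippet
            String request = peer + "/xml/snippet.xml"       // assumed servlet path
                + "?url=" + URLEncoder.encode(target, "UTF-8")
                + "&search=" + URLEncoder.encode("yacy", "UTF-8")
                + "&media=text";                             // or e.g. "image"
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(new URL(request).openStream()));
            String line;
            while ((line = in.readLine()) != null) System.out.println(line);
            in.close();
        }
    }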

@ -222,7 +222,7 @@ public class yacysearch {
if (urlentry != null) {
indexURLEntry.Components comp = urlentry.comp();
plasmaParserDocument document;
document = sb.snippetCache.retrieveDocument(comp.url(), true, 5000);
document = sb.snippetCache.retrieveDocument(comp.url(), true, 5000, true);
if (document != null) {
// create a news message
HashMap map = new HashMap();

@ -114,97 +114,113 @@ public final class plasmaCondenser {
//public int RESULT_NUMB_TEXT_BYTES = -1;
public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1;
public int RESULT_SIMI_WORDS = -1;
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public int RESULT_SIMI_SENTENCES = -1;
public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);
public plasmaCondenser(plasmaParserDocument document, boolean addMedia) throws UnsupportedEncodingException {
public plasmaCondenser(plasmaParserDocument document, boolean indexText, boolean indexMedia) throws UnsupportedEncodingException {
// if indexMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
this(document.getText(), document.getCharset());
this.wordminsize = 3;
this.wordcut = 2;
this.words = new TreeMap();
this.sentences = new HashMap();
kelondroBitfield wflags = (kelondroBitfield) RESULT_FLAGS.clone(); // the template for the word flags, only from position 0..19
// construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainLongTitle
// phrase 2 is the MainShortTitle
// phrase 3 is the Document Abstract
// phrase 4 is the Document Author
// phrase 5 are the tags specified in document
// phrase 10 and above are the section headlines/titles (88 possible)
// phrase 98 is taken from the embedded anchor/hyperlinks description
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
Map.Entry entry;
if (indexText) {
createCondensement(document.getText(), document.getCharset());
kelondroBitfield wflags = (kelondroBitfield) RESULT_FLAGS.clone(); // the template for the word flags, only from position 0..19
// construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainLongTitle
// phrase 2 is the MainShortTitle
// phrase 3 is the Document Abstract
// phrase 4 is the Document Author
// phrase 5 are the tags specified in document
// phrase 10 and above are the section headlines/titles (88 possible)
// phrase 98 is taken from the embedded anchor/hyperlinks description
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(document.getMainLongTitle(), 1, indexRWIEntryNew.flag_app_descr, wflags);
insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
insertTextToWords(document.getAbstract(), 3, indexRWIEntryNew.flag_app_descr, wflags);
// missing: author!
// missing: tags!
String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
insertTextToWords(titles[i], i + 10, indexRWIEntryNew.flag_app_emphasized, wflags);
}
insertTextToWords(document.getMainLongTitle(), 1, indexRWIEntryNew.flag_app_descr, wflags);
insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
insertTextToWords(document.getAbstract(), 3, indexRWIEntryNew.flag_app_descr, wflags);
// missing: author!
// missing: tags!
String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
insertTextToWords(titles[i], i + 10, indexRWIEntryNew.flag_app_emphasized, wflags);
}
// anchors
Iterator i = document.getAnchors().entrySet().iterator();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 98, indexRWIEntryNew.flag_app_url, wflags);
insertTextToWords((String) entry.getValue(), 98, indexRWIEntryNew.flag_app_url, wflags);
// anchors
Iterator i = document.getAnchors().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 98, indexRWIEntryNew.flag_app_url, wflags);
insertTextToWords((String) entry.getValue(), 98, indexRWIEntryNew.flag_app_url, wflags);
}
} else {
this.RESULT_NUMB_WORDS = 0;
this.RESULT_DIFF_WORDS = 0;
this.RESULT_NUMB_SENTENCES = 0;
this.RESULT_DIFF_SENTENCES = 0;
}
// audio
i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, wflags);
}
// video
i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, wflags);
}
// applications
i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, wflags);
}
// images
i = document.getImages().iterator();
htmlFilterImageEntry ientry;
while (i.hasNext()) {
ientry = (htmlFilterImageEntry) i.next();
insertTextToWords((String) ientry.url().toNormalform(), 99, flag_cat_hasimage, wflags);
insertTextToWords((String) ientry.alt(), 99, flag_cat_hasimage, wflags);
}
if (indexMedia) {
kelondroBitfield wflags = (kelondroBitfield) RESULT_FLAGS.clone(); // the template for the word flags, only from position 0..19
// audio
Iterator i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, wflags);
}
// video
i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, wflags);
}
// applications
i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, wflags);
}
// images
i = document.getImages().iterator();
htmlFilterImageEntry ientry;
while (i.hasNext()) {
ientry = (htmlFilterImageEntry) i.next();
insertTextToWords((String) ientry.url().toNormalform(), 99, flag_cat_hasimage, wflags);
insertTextToWords((String) ientry.alt(), 99, flag_cat_hasimage, wflags);
}
// finally check all words for missing flag entry
i = words.entrySet().iterator();
wordStatProp wprop;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
wprop = (wordStatProp) entry.getValue();
if (wprop.flags == null) {
wprop.flags = (kelondroBitfield) wflags.clone();
words.put(entry.getKey(), wprop);
// finally check all words for missing flag entry
i = words.entrySet().iterator();
wordStatProp wprop;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
wprop = (wordStatProp) entry.getValue();
if (wprop.flags == null) {
wprop.flags = (kelondroBitfield) wflags.clone();
words.put(entry.getKey(), wprop);
}
}
}
}
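
The call sites simply pass the profile flags through; condensed from the plasmaSwitchboard hunks further down:

    // per-profile condensing in the indexing pipeline:
    plasmaCondenser condenser = new plasmaCondenser(document,
            entry.profile().indexText(),    // condense the document text
            entry.profile().indexMedia());  // condense the media link words

    // unconditional condensing of both kinds (word-set extraction):
    Set words = new plasmaCondenser(document, true, true).words().keySet();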
@ -229,6 +245,8 @@ public final class plasmaCondenser {
wprop.flags.set(flagpos, true);
words.put(word, wprop);
pip++;
this.RESULT_NUMB_WORDS++;
this.RESULT_DIFF_WORDS++;
}
}
@ -282,6 +300,10 @@ public final class plasmaCondenser {
return words;
}
public Map sentences() {
return sentences;
}
public static class wordStatProp {
// object carries statistics for words and sentences
@ -534,14 +556,12 @@ public final class plasmaCondenser {
//this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
this.RESULT_NUMB_WORDS = allwordcounter;
this.RESULT_DIFF_WORDS = wordHandleCount;
this.RESULT_SIMI_WORDS = words.size();
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
this.RESULT_SIMI_SENTENCES = sentences.size();
}
public void print() {
String[] s = sentences();
String[] s = sentenceReconstruction();
// printout a reconstruction of the text
for (int i = 0; i < s.length; i++) {
@ -549,7 +569,7 @@ public final class plasmaCondenser {
}
}
public String[] sentences() {
private String[] sentenceReconstruction() {
// we reconstruct the word hashtable
// and order the entries by the number of the sentence
// this structure is only needed to reconstruct the text
@ -613,49 +633,6 @@ public final class plasmaCondenser {
return orderedSentences;
}
/*
public void writeMapToFile(File out) throws IOException {
Map.Entry entry;
String k;
String word;
Iterator it;
wordStatProp wsp;
Object[] orderedSentences = makeOrderedSentences();
// we reconstruct the word hashtable
// and sort the entries by the number of occurrences
// this structure is needed to print out a sorted list of words
TreeMap sortedWords = new TreeMap(); //kelondroNaturalOrder.naturalOrder
it = words.entrySet().iterator(); // enumerates the keys in ascending order
while (it.hasNext()) {
entry = (Map.Entry) it.next();
word = (String) entry.getKey();
wsp = (wordStatProp) entry.getValue();
sortedWords.put(intString(wsp.count, numlength) + intString(wsp.posInText, numlength), word);
}
// start writing of words and sentences
FileWriter writer = new FileWriter(out);
writer.write("\r\n");
it = sortedWords.entrySet().iterator(); // enumerates the keys in descending order
while (it.hasNext()) {
entry = (Map.Entry) it.next();
k = (String) entry.getKey();
writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " + ((String) entry.getValue()) + "\r\n");
}
for (int i = 0; i < orderedSentences.length; i++) {
if (orderedSentences[i] != null) {
writer.write("#S " + intString(i, numlength) + " ");
for (int j = 0; j < ((String[]) orderedSentences[i]).length; j++) {
writer.write(((String[]) orderedSentences[i])[j] + " ");
}
writer.write("\r\n");
}
}
writer.close();
}
*/
public final static boolean invisible(char c) {
// TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
if ((c < ' ') || (c > 'z')) return true;

@ -186,14 +186,18 @@ public class plasmaCrawlProfile {
int generalDepth, int specificDepth,
int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages,
boolean crawlingQ,
boolean indexText, boolean indexMedia,
boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing,
boolean remoteIndexing,
boolean xsstopw, boolean xdstopw, boolean xpstopw) {
entry ne = new entry(name, startURL, generalFilter, specificFilter,
generalDepth, specificDepth,
recrawlIfOlder, domFilterDepth, domMaxPages,
crawlingQ, storeHTCache, storeTXCache, localIndexing, remoteIndexing,
crawlingQ,
indexText, indexMedia,
storeHTCache, storeTXCache,
remoteIndexing,
xsstopw, xdstopw, xpstopw);
try {
profileTable.set(ne.handle(), ne.map());
@ -254,8 +258,9 @@ public class plasmaCrawlProfile {
int generalDepth, int specificDepth,
int recrawlIfOlder /*minutes*/, int domFilterDepth, int domMaxPages,
boolean crawlingQ,
boolean indexText, boolean indexMedia,
boolean storeHTCache, boolean storeTXCache,
boolean localIndexing, boolean remoteIndexing,
boolean remoteIndexing,
boolean xsstopw, boolean xdstopw, boolean xpstopw) {
String handle = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, crawlProfileHandleLength);
mem = new HashMap();
@ -270,9 +275,10 @@ public class plasmaCrawlProfile {
mem.put("domFilterDepth", Integer.toString(domFilterDepth));
mem.put("domMaxPages", Integer.toString(domMaxPages));
mem.put("crawlingQ", (crawlingQ) ? "true" : "false"); // crawling of urls with '?'
mem.put("indexText", (indexText) ? "true" : "false");
mem.put("indexMedia", (indexMedia) ? "true" : "false");
mem.put("storeHTCache", (storeHTCache) ? "true" : "false");
mem.put("storeTXCache", (storeTXCache) ? "true" : "false");
mem.put("localIndexing", (localIndexing) ? "true" : "false");
mem.put("remoteIndexing", (remoteIndexing) ? "true" : "false");
mem.put("xsstopw", (xsstopw) ? "true" : "false"); // exclude static stop-words
mem.put("xdstopw", (xdstopw) ? "true" : "false"); // exclude dynamic stop-word
@ -376,6 +382,14 @@ public class plasmaCrawlProfile {
String r = (String) mem.get("crawlingQ");
if (r == null) return false; else return (r.equals("true"));
}
public boolean indexText() {
String r = (String) mem.get("indexText");
if (r == null) return false; else return (r.equals("true"));
}
public boolean indexMedia() {
String r = (String) mem.get("indexMedia");
if (r == null) return false; else return (r.equals("true"));
}
public boolean storeHTCache() {
String r = (String) mem.get("storeHTCache");
if (r == null) return false; else return (r.equals("true"));
@ -384,10 +398,6 @@ public class plasmaCrawlProfile {
String r = (String) mem.get("storeTXCache");
if (r == null) return false; else return (r.equals("true"));
}
public boolean localIndexing() {
String r = (String) mem.get("localIndexing");
if (r == null) return false; else return (r.equals("true"));
}
public boolean remoteIndexing() {
String r = (String) mem.get("remoteIndexing");
if (r == null) return false; else return (r.equals("true"));

@ -61,7 +61,7 @@ public final class plasmaSearchImages {
long start = System.currentTimeMillis();
this.images = new TreeSet();
if (maxTime > 10) {
Object[] resource = sc.getResource(url, true, (int) maxTime);
Object[] resource = sc.getResource(url, true, (int) maxTime, false);
InputStream res = (InputStream) resource[0];
Long resLength = (Long) resource[1];
if (res != null) {

@ -256,7 +256,7 @@ public class plasmaSnippetCache {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout, true);
plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout, true, true);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
@ -341,7 +341,7 @@ public class plasmaSnippetCache {
* @param fetchOnline specifies if the resource should be loaded from the web if it is not available in the cache
* @return the parsed document as {@link plasmaParserDocument}
*/
public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline, int timeout) {
public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline, int timeout, boolean forText) {
// load resource
long resContentLength = 0;
@ -357,7 +357,7 @@ public class plasmaSnippetCache {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout, true);
plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout, true, forText);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
@ -593,7 +593,7 @@ public class plasmaSnippetCache {
return new ArrayList();
}
plasmaParserDocument document = retrieveDocument(url, fetchOnline, timeout);
plasmaParserDocument document = retrieveDocument(url, fetchOnline, timeout, false);
ArrayList a = new ArrayList();
if (document != null) {
a.addAll(computeMediaSnippets(document, queryhashes, "audio"));
@ -783,7 +783,7 @@ public class plasmaSnippetCache {
* <tr><td>[1]</td><td>the content-length as {@link Integer}</td></tr>
* </table>
*/
public Object[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
public Object[] getResource(URL url, boolean fetchOnline, int socketTimeout, boolean forText) {
// load the url as resource from the web
try {
long contentLength = -1;
@ -796,7 +796,7 @@ public class plasmaSnippetCache {
// if the content is not available in cache try to download it from web
// try to download the resource using a crawler
plasmaHTCache.Entry entry = loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true);
plasmaHTCache.Entry entry = loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true, forText);
// read resource body (if it is there)
byte[] resourceArray = entry.cacheArray();
@ -821,7 +821,8 @@ public class plasmaSnippetCache {
public plasmaHTCache.Entry loadResourceFromWeb(
URL url,
int socketTimeout,
boolean keepInMemory
boolean keepInMemory,
boolean forText
) throws plasmaCrawlerException {
plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync(
@ -830,7 +831,7 @@ public class plasmaSnippetCache {
null, // referer
yacyCore.seedDB.mySeed.hash, // initiator
0, // depth
sb.defaultSnippetProfile, // crawl profile
(forText) ? sb.defaultTextSnippetProfile : sb.defaultMediaSnippetProfile, // crawl profile
socketTimeout,
keepInMemory
);
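
The added forText flag only decides which snippet profile drives the on-demand fetch; the call sites in this commit choose it per use case:

    // forText == true  -> sb.defaultTextSnippetProfile  (text snippets: ViewFile, yacysearch)
    // forText == false -> sb.defaultMediaSnippetProfile (media fetches: ViewImage, media snippets)
    entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false, true);         // text use case
    Object[] resource = sb.snippetCache.getResource(url, true, timeout, false);  // media use case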

@ -221,7 +221,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public plasmaCrawlProfile profiles;
public plasmaCrawlProfile.entry defaultProxyProfile;
public plasmaCrawlProfile.entry defaultRemoteProfile;
public plasmaCrawlProfile.entry defaultSnippetProfile;
public plasmaCrawlProfile.entry defaultTextSnippetProfile;
public plasmaCrawlProfile.entry defaultMediaSnippetProfile;
public boolean rankingOn;
public plasmaRankingDistribution rankingOwnDistribution;
public plasmaRankingDistribution rankingOtherDistribution;
@ -777,7 +778,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
private void initProfiles() {
this.defaultProxyProfile = null;
this.defaultRemoteProfile = null;
this.defaultSnippetProfile = null;
this.defaultTextSnippetProfile = null;
this.defaultMediaSnippetProfile = null;
Iterator i = this.profiles.profiles(true);
plasmaCrawlProfile.entry profile;
String name;
@ -786,24 +788,30 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
name = profile.name();
if (name.equals("proxy")) this.defaultProxyProfile = profile;
if (name.equals("remote")) this.defaultRemoteProfile = profile;
if (name.equals("snippet")) this.defaultSnippetProfile = profile;
if (name.equals("snippetText")) this.defaultTextSnippetProfile = profile;
if (name.equals("snippetMedia")) this.defaultMediaSnippetProfile = profile;
}
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = this.profiles.newEntry("proxy", "", ".*", ".*",
Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
Integer.parseInt(getConfig("proxyPrefetchDepth", "0")),
60 * 24, -1, -1, false, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
60 * 24, -1, -1, false, true, true, true, true, getConfigBool("proxyCrawlOrder", false), true, true, true);
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
defaultRemoteProfile = this.profiles.newEntry("remote", "", ".*", ".*", 0, 0,
-1, -1, -1, true, false, true, true, false, true, true, false);
-1, -1, -1, true, true, true, false, true, false, true, true, false);
}
if (this.defaultSnippetProfile == null) {
if (this.defaultTextSnippetProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultSnippetProfile = this.profiles.newEntry("snippet", "", ".*", ".*", 0, 0,
60 * 24 * 30, -1, -1, true, true, true, true, false, true, true, false);
defaultTextSnippetProfile = this.profiles.newEntry("snippetText", "", ".*", ".*", 0, 0,
60 * 24 * 30, -1, -1, true, true, true, true, true, false, true, true, false);
}
if (this.defaultMediaSnippetProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetProfile = this.profiles.newEntry("snippetMedia", "", ".*", ".*", 0, 0,
60 * 24 * 30, -1, -1, true, false, true, true, true, false, true, true, false);
}
}
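
The long positional boolean lists above are easy to misread; annotated against the newEntry signature from the plasmaCrawlProfile hunk above, the snippetText call reads as follows (a reading aid only, not a change):

    defaultTextSnippetProfile = this.profiles.newEntry(
            "snippetText", "", ".*", ".*", 0, 0,
            60 * 24 * 30,   // recrawlIfOlder (minutes): one month
            -1,             // domFilterDepth: no auto-filter
            -1,             // domMaxPages: no count restriction
            true,           // crawlingQ
            true,           // indexText
            true,           // indexMedia
            true,           // storeHTCache
            true,           // storeTXCache
            false,          // remoteIndexing
            true,           // xsstopw
            true,           // xdstopw
            false);         // xpstopw

The snippetMedia profile is identical except that indexText is false.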
@ -830,7 +838,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
entry = (plasmaCrawlProfile.entry) iter.next();
if (!((entry.name().equals("proxy")) ||
(entry.name().equals("remote")) ||
(entry.name().equals("snippet")))) {
(entry.name().equals("snippetText")) ||
(entry.name().equals("snippetMedia")))) {
iter.remove();
hasDoneSomething = true;
}
@ -1575,7 +1584,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
plasmaCondenser condenser = new plasmaCondenser(document, true);
plasmaCondenser condenser = new plasmaCondenser(document, entry.profile().indexText(), entry.profile().indexMedia());
// generate citation reference
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
@ -1632,7 +1641,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(processCase == PROCESSCASE_5_LOCAL_CRAWLING) ||
(processCase == PROCESSCASE_6_GLOBAL_CRAWLING)
) &&
(entry.profile().localIndexing())
((entry.profile().indexText()) || (entry.profile().indexMedia()))
) {
String urlHash = newEntry.hash();
@ -1673,7 +1682,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry);
ArrayList tmpContainers = new ArrayList(condenser.RESULT_SIMI_WORDS);
ArrayList tmpContainers = new ArrayList(condenser.words().size());
String language = plasmaURL.language(entry.url());
char doctype = plasmaURL.docType(document.getMimeType());
@ -1695,8 +1704,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
urlLength, urlComps,
wordStat.count,
document.getMainLongTitle().length(),
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
condenser.words().size(),
condenser.sentences().size(),
wordStat.posInText,
wordStat.posInPhrase,
wordStat.numOfPhrase,
@ -1715,7 +1724,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
tmpContainers.add(wordIdxContainer);
}
//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
words = condenser.RESULT_SIMI_WORDS;
words = condenser.words().size();
// transfering the index to the storage peer
indexContainer[] indexData = (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]);
@ -1875,7 +1884,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 2) + // count of links to other documents
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getTextLength(), 3) + // length of plain text in bytes
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_SIMI_WORDS, 3) + // count of all unique words
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.words().size(), 3) + // count of all unique words
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote)
//crl.append(head); crl.append ('|'); crl.append(cpl); crl.append((char) 13); crl.append((char) 10);
@ -2249,7 +2258,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
InputStream resourceContent = null;
try {
// get the resource content
Object[] resource = snippetCache.getResource(comp.url(), fetchOnline, 10000);
Object[] resource = snippetCache.getResource(comp.url(), fetchOnline, 10000, true);
resourceContent = (InputStream) resource[0];
Long resourceContentLength = (Long) resource[1];
@ -2259,7 +2268,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// get the word set
Set words = null;
try {
words = new plasmaCondenser(document, true).words().keySet();
words = new plasmaCondenser(document, true, true).words().keySet();
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}

@ -365,7 +365,7 @@ public class plasmaSwitchboardQueue {
}
// check profile
if (!profile().localIndexing()) {
if ((!profile().indexText()) && (!profile().indexMedia())) {
return "Indexing_Not_Allowed";
}
@ -420,7 +420,7 @@ public class plasmaSwitchboardQueue {
}
// check profile
if (!profile().localIndexing()) { return "Indexing_Not_Allowed"; }
if ((!profile().indexText()) && (!profile().indexMedia())) { return "Indexing_Not_Allowed"; }
final String nURL = normalizedURLString();
// -CGI access in request

@ -269,8 +269,8 @@ public final class plasmaWordIndex implements indexRI {
ientry = new indexRWIEntryNew(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
wprop.count,
condenser.RESULT_SIMI_WORDS,
condenser.RESULT_SIMI_SENTENCES,
condenser.words().size(),
condenser.sentences().size(),
wprop.posInText,
wprop.posInPhrase,
wprop.numOfPhrase,

@ -72,7 +72,7 @@ public class CrawlService extends AbstractService {
* Function to crawl a single link with depth <code>0</code>
*/
public Document crawlSingleUrl(String crawlingURL) throws AxisFault {
return this.crawling(crawlingURL, "CRAWLING-ROOT", new Integer(0), ".*", Boolean.TRUE, Boolean.TRUE, Boolean.TRUE, Boolean.FALSE, null, Boolean.TRUE);
return this.crawling(crawlingURL, "CRAWLING-ROOT", new Integer(0), ".*", Boolean.TRUE, Boolean.TRUE, Boolean.TRUE, Boolean.TRUE, Boolean.FALSE, null, Boolean.TRUE);
}
public Document crawling(
@ -80,7 +80,8 @@ public class CrawlService extends AbstractService {
String crawljobTitel,
Integer crawlingDepth,
String crawlingFilter,
Boolean localIndexing,
Boolean indexText,
Boolean indexMedia,
Boolean crawlingQ,
Boolean storeHTCache,
Boolean crawlOrder,
@ -100,8 +101,10 @@ public class CrawlService extends AbstractService {
args.put("crawlingFilter",crawlingFilter);
if (crawlingDepth != null && crawlingDepth.intValue() > 0)
args.put("crawlingDepth",crawlingDepth.toString());
if (localIndexing != null)
args.put("localIndexinglingQ",localIndexing.booleanValue()?"on":"off");
if (indexText != null)
args.put("indexText",indexText.booleanValue()?"on":"off");
if (indexMedia != null)
args.put("indexMedia",indexMedia.booleanValue()?"on":"off");
if (crawlingQ != null)
args.put("crawlingQ",crawlingQ.booleanValue()?"on":"off");
if (storeHTCache != null)

@ -53,13 +53,15 @@ public class urlRedirectord implements serverHandler {
// domMaxPages, if negative: no count restriction
-1,
// crawlDynamic
false,
false,
// indexText
true,
// indexMedia
true,
// storeHTCache
false,
// storeTxCache
true,
//localIndexing
true,
// remoteIndexing
false,
// xsstopw

@ -430,7 +430,8 @@ crawlingDepth=2
crawlingIfOlder=525600
crawlingDomFilterDepth=-1
crawlingDomMaxPages=-1
localIndexing=true
indexText=true
indexMedia=true
# Filter for crawling; may be used to restrict a crawl to a specific domain
# URLs are only indexed and further crawled if they match this filter
