implementation of search constraints

Such constraints may formulate specific restrictions on web searches.
This is implemented by scraping constraint information from a web
page during parsing, and storing flags for the pages within the web index.

In this first step, only information for index pages ("index of", directory listings)
is scraped and stored in flags
- added new flag class kelondroBitfield
- added scraper method in condenser
- added bitfield structure for all scrape types (see also condenser)
- added bitfield structure for appearance locations (see RWIEntry)
- added handover protocol for remote search and index distribution
- extended kelondroColumn class to hold bitfield types
- added another search attribute on search page (index.html)
- extended search-filter to enable filtering of non-matching constraints
- set all new database types to be default
- refactoring: moved word hash generation to condenser class

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2999 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 49a83f99d9
commit 30888e7a2f

@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.486
releaseVersion=0.487
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@ -137,7 +137,7 @@ public class DetailedSearch {
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", count, searchtime, urlmask,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20);
"", 20, plasmaSearchQuery.catchall_constraint);
plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString());
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(4 * thisSearch.maximumTime / 10, thisSearch.wantedResults);
plasmaSearchTimingProfile remoteTiming = new plasmaSearchTimingProfile(6 * thisSearch.maximumTime / 10, thisSearch.wantedResults);

@ -61,6 +61,7 @@ import de.anomic.index.indexRWIEntry;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
@ -175,7 +176,7 @@ public class IndexControl_p {
switchboard.wordIndex.deleteContainer(keyhash);
post.remove("keyhashdeleteall");
if (keystring.length() > 0 &&
plasmaURL.word2hash(keystring).equals(keyhash)) {
plasmaCondenser.word2hash(keystring).equals(keyhash)) {
post.put("keystringsearch", "generated");
} else {
post.put("keyhashsearch", "generated");
@ -198,7 +199,7 @@ public class IndexControl_p {
// this shall lead to a presentation of the list; so handle that the remaining program
// thinks that it was called for a list presentation
post.remove("keyhashdelete");
if (keystring.length() > 0 && plasmaURL.word2hash(keystring).equals(keyhash)) {
if (keystring.length() > 0 && plasmaCondenser.word2hash(keystring).equals(keyhash)) {
post.put("keystringsearch", "generated");
} else {
post.put("keyhashsearch", "generated");
@ -228,7 +229,7 @@ public class IndexControl_p {
}
if (post.containsKey("keystringsearch")) {
keyhash = plasmaURL.word2hash(keystring);
keyhash = plasmaCondenser.word2hash(keystring);
prop.put("keyhash", keyhash);
prop.put("urlstring", "");
prop.put("urlhash", "");
@ -236,7 +237,7 @@ public class IndexControl_p {
}
if (post.containsKey("keyhashsearch")) {
if (keystring.length() == 0 || !plasmaURL.word2hash(keystring).equals(keyhash)) {
if (keystring.length() == 0 || !plasmaCondenser.word2hash(keystring).equals(keyhash)) {
prop.put("keystring", "<not possible to compute word from hash>");
}
prop.put("urlstring", "");
@ -246,7 +247,7 @@ public class IndexControl_p {
// transfer to other peer
if (post.containsKey("keyhashtransfer")) {
if (keystring.length() == 0 || !plasmaURL.word2hash(keystring).equals(keyhash)) {
if (keystring.length() == 0 || !plasmaCondenser.word2hash(keystring).equals(keyhash)) {
prop.put("keystring", "<not possible to compute word from hash>");
}
prop.put("urlstring", "");

@ -59,6 +59,7 @@ import de.anomic.data.wikiCode;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaURL;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
@ -68,7 +69,6 @@ import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverThread;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNewsPool;
import de.anomic.yacy.yacyNewsRecord;
@ -204,7 +204,7 @@ public class IndexCreate_p {
prop.put("error_reasonString", reasonString);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
crawlingStartURL.getHost(), reasonString, new bitfield());
crawlingStartURL.getHost(), reasonString, new kelondroBitfield());
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
}
@ -282,7 +282,7 @@ public class IndexCreate_p {
c++;
} else {
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield());
(String) e.getValue(), rejectReason, new kelondroBitfield());
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
}

@ -1,6 +1,6 @@
<div class="SubMenu">
<ul class="SubMenu">
<li style="width:16%;"><a href="/Status.html?login=true" class="MenuItemLink">Peer Administration</a></li>
<li style="width:16%;"><a href="/Status.html" class="MenuItemLink">Peer Administration</a></li>
<li style="width:16%;"><a href="/index.html" onclick="this.href='/index.html?handover='+document.searchform.search.value" class="MenuItemLink">Web Search</a></li>
<li style="width:16%;"><a href="/Surftips.html" class="MenuItemLink">Surftips</a></li>
<li style="width:16%;"><a href="/ViewProfile.html?hash=localhash" class="MenuItemLink">Peer Owner Profile</a></li>

@ -62,6 +62,7 @@ import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSwitchboard;
@ -72,7 +73,6 @@ import de.anomic.server.serverMemory;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.tools.dirlistComparator;
import de.anomic.tools.md5DirFileFilter;
import de.anomic.yacy.yacyCore;
@ -375,7 +375,7 @@ public class dir {
(long) phrase.length(), // size
condenser.RESULT_NUMB_WORDS, // word count
plasmaURL.DT_SHARE, // doctype
new bitfield(4),
new kelondroBitfield(4),
"**", // language
0,0,0,0,0,0
);
@ -399,7 +399,7 @@ public class dir {
Map.Entry entry;
while (words.hasNext()) {
entry = (Map.Entry) words.next();
switchboard.wordIndex.removeEntry(plasmaURL.word2hash((String) entry.getKey()), urlhash, true);
switchboard.wordIndex.removeEntry(plasmaCondenser.word2hash((String) entry.getKey()), urlhash, true);
}
switchboard.urlPool.loadedURL.remove(urlhash);
} catch (Exception e) {

@ -29,6 +29,7 @@
<input type="hidden" name="time" value="6" />
<input type="hidden" name="urlmaskfilter" value=".*" />
<input type="hidden" name="prefermaskfilter" value="" />
<input type="hidden" name="indexof" value="off" />
</fieldset>
<p><a href="/index.html?searchoptions=1&amp;display=#[display]#" onclick="this.href='/index.html?searchoptions=1&amp;display=#[display]#&amp;handover='+document.searchform.search.value">more options...</a></p>
::
@ -104,6 +105,14 @@
#(/prefermaskoptions)#
</td>
</tr>
<tr>
<td>
Constraints:
</td>
<td>
<input type="checkbox" name="indexof" #[indexofChecked]# /> only index pages
</td>
</tr>
</table>
#(/searchoptions)#
</form>

@ -108,6 +108,7 @@ public class index {
prop.put("searchoptions_urlmaskoptions_urlmaskfilter", ".*");
prop.put("searchoptions_prefermaskoptions", 0);
prop.put("searchoptions_prefermaskoptions_prefermaskfilter", "");
prop.put("searchoptions_indexofChecked", "");
prop.put("results", "");
prop.put("cat", "href");
prop.put("type", "0");

@ -8,6 +8,7 @@ import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
@ -37,7 +38,7 @@ public class snippet {
}
// do the search
Set queryHashes = plasmaSearchQuery.words2hashes(query);
Set queryHashes = plasmaCondenser.words2hashes(query);
plasmaSnippetCache.Snippet snippet = switchboard.snippetCache.retrieveSnippet(url, queryHashes, true, 260, 10000);
prop.put("status",snippet.getSource());

@ -51,13 +51,13 @@ import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.tools.crypt;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
@ -156,7 +156,7 @@ public final class crawlReceipt {
} else {
try {
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield());
plasmaCrawlEURL.Entry ee = switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new kelondroBitfield());
ee.store();
switchboard.urlPool.errorURL.stackPushEntry(ee);
switchboard.urlPool.noticeURL.remove(receivedUrlhash);

@ -52,6 +52,7 @@ import java.util.Map;
import java.util.Set;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.index.indexContainer;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
@ -93,6 +94,7 @@ public final class search {
final String prefer = post.get("prefer", "");
final String filter = post.get("filter", ".*");
final boolean includesnippet = post.get("includesnippet", "false").equals("true");
final kelondroBitfield constraint = kelondroBitfield(4, post.get("constraint", "______"));
// final boolean global = ((String) post.get("resource", "global")).equals("global"); // if true, then result may consist of answers from other peers
// Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time
@ -131,7 +133,7 @@ public final class search {
plasmaSearchQuery squery = null;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
squery = new plasmaSearchQuery(abstractSet, maxdist, prefer, count, duetime, filter);
squery = new plasmaSearchQuery(abstractSet, maxdist, prefer, count, duetime, filter, plasmaSearchQuery.catchall_constraint);
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + squery.anonymizedQueryHashes() + " - " + squery.wantedResults + " links");
@ -158,7 +160,7 @@ public final class search {
prop.put("joincount", 0);
} else {
// retrieve index containers from search request
squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, count, duetime, filter);
squery = new plasmaSearchQuery(keyhashes, maxdist, prefer, count, duetime, filter, constraint);
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + squery.anonymizedQueryHashes() + " - " + squery.wantedResults + " links");
@ -299,4 +301,9 @@ public final class search {
return prop;
}
// Builds the search constraint bitfield from its base64-encoded request parameter.
//   i      - byte length of the bitfield to create
//   string - base64 export of the bitfield as received in the request
// The previous auto-generated stub returned null, which left the constraint
// handed to plasmaSearchQuery unset; delegate to the real constructor instead.
private static kelondroBitfield kelondroBitfield(int i, String string) {
    return new kelondroBitfield(i, string);
}
}

@ -35,6 +35,7 @@
<input type="hidden" name="cat" value="#[cat]#" />
<input type="hidden" name="type" value="#[type]#" />
<input type="hidden" name="display" value="#[display]#" />
<input type="hidden" name="constraint" value="#[constraint]#" />
</fieldset>
</form>
<script type="text/javascript">

@ -55,9 +55,11 @@ import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSearchImages;
import de.anomic.plasma.plasmaSearchPreOrder;
@ -113,6 +115,7 @@ public class yacysearch {
prop.put("time", 6);
prop.put("urlmaskfilter", ".*");
prop.put("prefermaskfilter", "");
prop.put("indexof", "off");
prop.put("cat", "href");
prop.put("depth", "0");
prop.put("type", 0);
@ -141,6 +144,7 @@ public class yacysearch {
final boolean indexDistributeGranted = sb.getConfig("allowDistributeIndex", "true").equals("true");
final boolean indexReceiveGranted = sb.getConfig("allowReceiveIndex", "true").equals("true");
if (!indexDistributeGranted || !indexReceiveGranted) { global = false; }
final boolean indexof = post.get("indexof","").equals("on");
final long searchtime = 1000 * Long.parseLong(post.get("time", "10"));
String urlmask = "";
if (post.containsKey("urlmask") && post.get("urlmask").equals("no")) {
@ -151,6 +155,12 @@ public class yacysearch {
String prefermask = post.get("prefermaskfilter", "");
if ((prefermask.length() > 0) && (prefermask.indexOf(".*") < 0)) prefermask = ".*" + prefermask + ".*";
kelondroBitfield constraint = post.containsKey("constraint") ? new kelondroBitfield(4, post.get("constraint", "______")) : plasmaSearchQuery.catchall_constraint;
if (indexof) {
constraint = new kelondroBitfield();
constraint.set(plasmaCondenser.flag_cat_indexof, true);
}
serverObjects prop = new serverObjects();
if (post.get("cat", "href").equals("href")) {
@ -233,7 +243,7 @@ public class yacysearch {
urlmask,
((global) && (yacyonline) && (!(env.getConfig(
"last-search", "").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT
: plasmaSearchQuery.SEARCHDOM_LOCAL, "", 20);
: plasmaSearchQuery.SEARCHDOM_LOCAL, "", 20, constraint);
plasmaSearchRankingProfile ranking = new plasmaSearchRankingProfile( new String[] { order1, order2, order3 });
plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(4 * thisSearch.maximumTime / 10, thisSearch.wantedResults);
plasmaSearchTimingProfile remoteTiming = new plasmaSearchTimingProfile(6 * thisSearch.maximumTime / 10, thisSearch.wantedResults);
@ -393,6 +403,8 @@ public class yacysearch {
prop.put("urlmaskfilter", urlmask);
prop.put("prefermaskfilter", prefermask);
prop.put("display", display);
prop.put("indexof", (indexof) ? "on" : "off");
prop.put("constraint", constraint.exportB64());
// return rewrite properties
return prop;

@ -72,6 +72,7 @@ import org.xml.sax.SAXException;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroException;
@ -88,7 +89,7 @@ public class bookmarksDB {
HashMap bookmarkCache;
public static String tagHash(String tagName){
return plasmaURL.word2hash(tagName.toLowerCase());
return plasmaCondenser.word2hash(tagName.toLowerCase());
}
public static String dateToiso8601(Date date){
return new SimpleDateFormat("yyyy-MM-dd").format(date)+"T"+(new SimpleDateFormat("HH:mm:ss")).format(date)+"Z";

@ -27,6 +27,7 @@
package de.anomic.index;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroBitfield;
public interface indexRWIEntry {
@ -46,7 +47,7 @@ public interface indexRWIEntry {
public int phrasecount();
public String getLanguage();
public char getType();
public boolean isLocal();
public kelondroBitfield flags();
public void combineDistance(indexRWIEntry oe);
public int worddistance();

@ -27,6 +27,7 @@
package de.anomic.index;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
@ -53,7 +54,7 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
new kelondroColumn("m", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "urlLength"),
new kelondroColumn("n", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "urlComps"),
new kelondroColumn("g", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 1, "typeofword"),
new kelondroColumn("z", kelondroColumn.celltype_binary, kelondroColumn.encoder_bytes, 4, "flags"),
new kelondroColumn("z", kelondroColumn.celltype_bitfield, kelondroColumn.encoder_bytes, 4, "flags"),
new kelondroColumn("c", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "hitcount"),
new kelondroColumn("t", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 2, "posintext"),
new kelondroColumn("r", kelondroColumn.celltype_cardinal, kelondroColumn.encoder_b256, 1, "posinphrase"),
@ -79,7 +80,7 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
// dynamic properties
private static final int col_typeofword = 12; // g 1 grammatical classification
private static final int col_flags = 13; // z 4 b64-encoded flags; this has space for 24 bit
private static final int col_flags = 13; // z 4 b64-encoded appearance flags (24 bit, see definition below)
private static final int col_hitcount = 14; // c 1 number of occurrences of this word in text
private static final int col_posintext = 15; // t 2 first appearance of word in text
private static final int col_posinphrase = 16; // r 1 position of word in its phrase
@ -87,32 +88,37 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
private static final int col_worddistance = 18; // i 1 initial zero; may be used as reserve: is filled during search
private static final int col_reserve = 19; // k 1 reserve
// more needed attributes:
// - boolean: appearance attributes: title, appears in header, anchor-descr, image-tag, hervorhebungen, meta-tags, word in link, etc
// - boolean: URL attributes
// appearance flags, used in RWI entry
// the flags 0..15 are identical to the category flags in plasmaCondenser
public static final int flag_app_url = 16; // word appears in url
public static final int flag_app_descr = 17; // word appears in headline (or any description part)
public static final int flag_app_author = 18; // word appears in author
public static final int flag_app_tags = 19; // word appears in header tags
public static final int flag_app_reference = 20; // word appears in anchor description text (the reference to an url), or any alternative text field of a link
public static final int flag_app_emphasized = 21; // word is emphasized in text (i.e. bold, italics, special size)
private kelondroRow.Entry entry;
public indexRWIEntryNew(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int titleLength, // length of description/length (longer are better?)
int hitcount, // how often appears this word in the text
int wordcount, // total number of words
int phrasecount, // total number of phrases
int posintext, // position of word in all words
int posinphrase, // position of word in its phrase
int posofphrase, // number of the phrase where word appears
int worddistance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
int sizeOfPage, // # of bytes of the page TODO: not needed any more
long lastmodified, // last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
int quality, // the entropy value
String language, // (guessed) language of document
char doctype, // type of document
int outlinksSame, // outlinks to same domain
int outlinksOther,// outlinks to other domain
boolean local // not needed. TODO: remove this
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
int titleLength, // length of description/length (longer are better?)
int hitcount, // how often appears this word in the text
int wordcount, // total number of words
int phrasecount, // total number of phrases
int posintext, // position of word in all words
int posinphrase, // position of word in its phrase
int posofphrase, // number of the phrase where word appears
int worddistance, // word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultanous search). If stored, this shows that the result was obtained by remote search
int sizeOfPage, // # of bytes of the page TODO: not needed any more
long lastmodified, // last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
int quality, // the entropy value
String language, // (guessed) language of document
char doctype, // type of document
int outlinksSame, // outlinks to same domain
int outlinksOther, // outlinks to other domain
kelondroBitfield flags // attributes to the url and to the word according the url
) {
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
@ -133,7 +139,7 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
this.entry.setCol(col_urlLength, urlLength);
this.entry.setCol(col_urlComps, urlComps);
this.entry.setCol(col_typeofword, new byte[]{(byte) 0}); // TODO: grammatical classification
this.entry.setCol(col_flags, null); // TODO: generate flags
this.entry.setCol(col_flags, flags.bytes());
this.entry.setCol(col_hitcount, hitcount);
this.entry.setCol(col_posintext, posintext);
this.entry.setCol(col_posinphrase, posinphrase);
@ -160,7 +166,7 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
this.entry.setCol(col_urlLength, domlen * 2); // estimated
this.entry.setCol(col_urlComps, domlen / 3); // estimated
this.entry.setCol(col_typeofword, new byte[]{(byte) 0});
this.entry.setCol(col_flags, null);
this.entry.setCol(col_flags, (new kelondroBitfield(4)).bytes());
this.entry.setCol(col_hitcount, oldEntry.hitcount());
this.entry.setCol(col_posintext, oldEntry.posintext());
this.entry.setCol(col_posinphrase, oldEntry.posinphrase());
@ -254,8 +260,8 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
return (char) this.entry.getColByte(col_doctype);
}
public boolean isLocal() {
return false; // not used
public kelondroBitfield flags() {
return new kelondroBitfield(this.entry.getColBytes(col_flags));
}
public static indexRWIEntryNew combineDistance(indexRWIEntryNew ie1, indexRWIEntry ie2) {

@ -27,9 +27,11 @@
package de.anomic.index;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.yacy.yacySeedDB;
@ -200,8 +202,8 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
return (char) this.entry.getColByte(col_doctype);
}
public boolean isLocal() {
return this.entry.getColByte(col_localflag) == plasmaURL.LT_LOCAL;
public kelondroBitfield flags() {
return plasmaSearchQuery.empty_constraint;
}
public static indexRWIEntryOld combineDistance(indexRWIEntryOld ie1, indexRWIEntry ie2) {

@ -7,15 +7,16 @@ import java.util.Date;
import java.util.Properties;
import java.util.ArrayList;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.server.serverCharBuffer;
import de.anomic.server.serverCodings;
import de.anomic.tools.crypt;
import de.anomic.tools.bitfield;
import de.anomic.tools.nxTools;
public class indexURLEntryNew implements indexURLEntry {
@ -33,7 +34,7 @@ public class indexURLEntryNew implements indexURLEntry {
"Cardinal size-6 {b256}, " + // size of file in bytes
"Cardinal wc-3 {b256}, " + // size of file by number of words; for video and audio: seconds
"byte[] dt-1, " + // doctype, taken from extension or any other heuristic
"byte[] flags-4, " + // flags; any stuff (see Word-Entity definition)
"Bitfield flags-4, " + // flags; any stuff (see Word-Entity definition)
"String lang-2, " + // language
"Cardinal llocal-2 {b256}, " + // # of outlinks to same domain; for video and image: width
"Cardinal lother-2 {b256}, " + // # of outlinks to outside domain; for video and image: height
@ -61,7 +62,6 @@ public class indexURLEntryNew implements indexURLEntry {
private static final int col_lvideo = 16;
private static final int col_lapp = 17;
private kelondroRow.Entry entry;
private String snippet;
private indexRWIEntry word; // this is only used if the url is transported via remote search requests
@ -80,7 +80,7 @@ public class indexURLEntryNew implements indexURLEntry {
long size,
int wc,
char dt,
bitfield flags,
kelondroBitfield flags,
String lang,
int llocal,
int lother,
@ -100,7 +100,7 @@ public class indexURLEntryNew implements indexURLEntry {
this.entry.setCol(col_size, size);
this.entry.setCol(col_wc, wc);
this.entry.setCol(col_dt, new byte[]{(byte) dt});
this.entry.setCol(col_flags, flags.getBytes());
this.entry.setCol(col_flags, flags.bytes());
this.entry.setCol(col_lang, lang.getBytes());
this.entry.setCol(col_llocal, llocal);
this.entry.setCol(col_lother, lother);
@ -171,7 +171,8 @@ public class indexURLEntryNew implements indexURLEntry {
this.entry.setCol(col_size, Integer.parseInt(prop.getProperty("size", "0")));
this.entry.setCol(col_wc, Integer.parseInt(prop.getProperty("wc", "0")));
this.entry.setCol(col_dt, new byte[]{(byte) prop.getProperty("dt", "t").charAt(0)});
this.entry.setCol(col_flags, serverCodings.decodeHex(prop.getProperty("flags", "00000000")));
String flags = prop.getProperty("flags", "AAAAAA");
this.entry.setCol(col_flags, (flags.length() > 6) ? plasmaSearchQuery.empty_constraint.bytes() : (new kelondroBitfield(4, flags)).bytes());
this.entry.setCol(col_lang, prop.getProperty("lang", "uk").getBytes());
this.entry.setCol(col_llocal, Integer.parseInt(prop.getProperty("llocal", "0")));
this.entry.setCol(col_lother, Integer.parseInt(prop.getProperty("lother", "0")));
@ -208,7 +209,7 @@ public class indexURLEntryNew implements indexURLEntry {
s.append(",size=").append(size());
s.append(",wc=").append(wordCount());
s.append(",dt=").append(doctype());
s.append(",flags=").append(serverCodings.encodeHex(flags().getBytes()));
s.append(",flags=").append(flags().exportB64());
s.append(",lang=").append(language());
s.append(",llocal=").append(llocal());
s.append(",lother=").append(lother());
@ -289,8 +290,8 @@ public class indexURLEntryNew implements indexURLEntry {
return (int) this.entry.getColLong(col_size);
}
public bitfield flags() {
return new bitfield(this.entry.getColBytes(col_flags));
public kelondroBitfield flags() {
return new kelondroBitfield(this.entry.getColBytes(col_flags));
}
public int wordCount() {

@ -32,11 +32,11 @@ import java.util.Properties;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.tools.crypt;
import de.anomic.yacy.yacySeedDB;
@ -87,7 +87,7 @@ public class indexURLEntryOld implements indexURLEntry {
long size,
int wc,
char dt,
bitfield flags,
kelondroBitfield flags,
String lang,
int llocal,
int lother,

@ -52,10 +52,10 @@ import de.anomic.server.logging.serverLog;
public class kelondroBase64Order extends kelondroAbstractOrder implements kelondroOrder, kelondroCoding, Comparator {
private static final char[] alpha_standard = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray();
private static final char[] alpha_enhanced = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_".toCharArray();
private static final byte[] ahpla_standard = new byte[128];
private static final byte[] ahpla_enhanced = new byte[128];
protected static final char[] alpha_standard = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray();
protected static final char[] alpha_enhanced = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_".toCharArray();
protected static final byte[] ahpla_standard = new byte[128];
protected static final byte[] ahpla_enhanced = new byte[128];
static {
for (int i = 0; i < 128; i++) {

@ -0,0 +1,166 @@
// kelondroBitfield.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 22.22.2006 on http://www.anomic.de
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro;
public class kelondroBitfield {
// the bitfield implements a binary array. Such arrays may be exported in a base64-String
private byte[] bb;
// creates an empty bitfield (zero bytes); set() grows the field on demand
public kelondroBitfield() {
    this(new byte[0]);
}
// wraps the given byte array as a bitfield; a null argument yields an empty field.
// the array is stored as-is (no copy is made), so later changes to it are shared
public kelondroBitfield(byte[] b) {
    this.bb = (b != null) ? b : new byte[0];
}
// creates a bitfield of bytelength bytes (8 bits each) with all bits cleared
public kelondroBitfield(int bytelength) {
    // a newly allocated byte array is already zero-filled by the language,
    // so no explicit clearing loop is needed
    this.bb = new byte[bytelength];
}
public kelondroBitfield(int bytelength, String exported) {
// imports a b64-encoded bitfield
byte[] b = kelondroBase64Order.enhancedCoder.decode(exported);
if (b.length == bytelength) {
bb = b;
} else {
bb = new byte[bytelength];
assert (b.length <= bytelength) : "exported = " + exported + " has bytelength = " + b.length + " > " + bytelength;
System.arraycopy(b, 0, bb, 0, Math.min(b.length, bytelength));
}
}
// sets or clears the bit at position pos (bit 0 is the lowest bit of byte 0).
// if pos lies beyond the current capacity, the backing array is enlarged so
// that it just contains the addressed byte
public void set(int pos, boolean value) {
    assert (pos >= 0);
    final int slot = pos / 8;
    if (slot >= bb.length) {
        // extend capacity; the new tail bytes start out as zero
        final byte[] grown = new byte[slot + 1];
        System.arraycopy(bb, 0, grown, 0, bb.length);
        bb = grown;
    }
    final int mask = 1 << (pos % 8);
    bb[slot] = (byte) (value ? (bb[slot] | mask) : (bb[slot] & (0xff ^ mask)));
}
public boolean get(int pos) {
assert (pos >= 0);
int slot = pos / 8;
if (slot > bb.length) return false;
return (bb[slot] & (1 << (pos % 8))) > 0;
}
public int length() {
return bb.length * 8;
}
public String exportB64() {
return kelondroBase64Order.enhancedCoder.encode(bb);
}
public byte[] bytes() {
return bb;
}
public String toString() {
StringBuffer sb = new StringBuffer(length());
for (int i = length() - 1; i >= 0; i--) sb.append((this.get(i)) ? '1' : '0');
return sb.toString();
}
public boolean equals(kelondroBitfield x) {
if (x.bb.length != bb.length) return false;
for (int i = 0; i < bb.length; i++) if (bb[i] != x.bb[i]) return false;
return true;
}
public void and(kelondroBitfield x) {
int c = Math.min(x.length(), this.length());
for (int i = 0; i < c; i++) set(i, this.get(i) && x.get(i));
}
public void or(kelondroBitfield x) {
int c = Math.min(x.length(), this.length());
for (int i = 0; i < c; i++) set(i, this.get(i) || x.get(i));
if (x.length() > c) {
for (int i = c; i < x.length(); i++) set(i, x.get(i));
}
}
public void xor(kelondroBitfield x) {
int c = Math.min(x.length(), this.length());
for (int i = 0; i < c; i++) set(i, this.get(i) != x.get(i));
if (x.length() > c) {
for (int i = c; i < x.length(); i++) set(i, x.get(i));
}
}
public boolean anyOf(kelondroBitfield x) {
int c = Math.min(x.length(), this.length());
for (int i = 0; i < c; i++) if ((x.get(i)) && (this.get(i))) return true;
return false;
}
public boolean allOf(kelondroBitfield x) {
int c = Math.min(x.length(), this.length());
for (int i = 0; i < c; i++) if ((x.get(i)) && (!(this.get(i)))) return false;
if (x.length() > c) {
for (int i = c; i < x.length(); i++) if (x.get(i)) return false;
}
return true;
}
public static void main(String[] args) {
kelondroBitfield test = new kelondroBitfield(4);
int l = test.length();
System.out.println("available: " + l);
System.out.println("bevore: " + test.toString());
for (int i = 0; i < l/2; i++) {
System.out.println(new String(test.exportB64()));
test.set(i, true);
System.out.println(i + ":" + test.toString());
}
for (int i = l/2; i < l; i++) {
System.out.println(new String(test.exportB64()));
test = new kelondroBitfield(4, test.exportB64());
test.set(i, true);
System.out.println(i + ":" + test.toString());
}
System.out.println(new String(test.exportB64()));
for (int i = l - 1; i >= 0; i--) {
test.set(i, false);
System.out.println(i + ":" + test.toString());
}
System.out.println("after: " + test.toString());
}
}

@ -34,6 +34,7 @@ public class kelondroColumn {
public static final int celltype_binary = 2;
public static final int celltype_string = 3;
public static final int celltype_cardinal = 4;
public static final int celltype_bitfield = 5;
public static final int encoder_none = 0;
public static final int encoder_b64e = 1;
@ -98,6 +99,9 @@ public class kelondroColumn {
} else if (typename.equals("Cardinal")) {
this.celltype = celltype_cardinal;
this.cellwidth = -1; // yet undefined
} else if (typename.equals("Bitfield")) {
this.celltype = celltype_bitfield;
this.cellwidth = -1; // yet undefined
} else {
throw new kelondroException("kelondroColumn - undefined type def '" + typename + "'");
}
@ -236,6 +240,12 @@ public class kelondroColumn {
s.append('-');
s.append(cellwidth);
break;
case celltype_bitfield:
s.append("Bitfield ");
s.append(nickname);
s.append('-');
s.append(cellwidth);
break;
}
switch (encoder) {

@ -65,7 +65,7 @@ public class kelondroFlexWidthArray implements kelondroArray {
String stored_rowdef = (String) props.get("rowdef");
if ((stored_rowdef == null) || (!(rowdef.subsumes(new kelondroRow(stored_rowdef))))) {
System.out.println("FATAL ERROR: stored rowdef '" + stored_rowdef + "' does not match with new rowdef '" +
rowdef + "' for flex table '" + path + "'");
rowdef + "' for flex table '" + path + "', table " + tablename);
System.exit(-1);
}
}

@ -209,6 +209,8 @@ public class kelondroRow {
} catch (NumberFormatException e) {
setCol(nick, 0);
}
} else if ((decimalCardinal) && (row[i].celltype() == kelondroColumn.celltype_bitfield)) {
setCol(nick, (new kelondroBitfield(row[i].cellwidth(), elts[i].substring(p + 1).trim())).bytes());
} else {
setCol(nick, elts[i].substring(p + 1).trim().getBytes());
}
@ -259,6 +261,12 @@ public class kelondroRow {
setCol(row[column].encoder(), colstart[column], row[column].cellwidth(), cell);
}
/**
 * Writes a char array into a column, one byte per char (each char is
 * narrowed to its low 8 bits), and zero-pads the rest of the column.
 *
 * A cell longer than the column is truncated to the column width so that
 * neighbouring columns are never overwritten; a null cell clears the column,
 * like the byte[] variant of this setter does.
 *
 * @param column index of the column to write
 * @param cell   the characters to store, may be null
 */
public void setCol(int column, char[] cell) {
    int offset = colstart[column];
    int width = row[column].cellwidth();
    // never write past the end of this column
    int n = (cell == null) ? 0 : Math.min(cell.length, width);
    for (int i = 0; i < n; i++) rowinstance[offset + i] = (byte) cell[i];
    for (int i = n; i < width; i++) rowinstance[offset + i] = 0;
}
private void setCol(int encoding, int offset, int length, byte[] cell) {
if (cell == null) {
while (length-- > 0) rowinstance[offset + length] = 0;
@ -419,16 +427,25 @@ public class kelondroRow {
return c;
}
/**
 * Reads a column as a char array, one char per stored byte.
 *
 * This is the inverse of setCol(int, char[]): every byte is widened to an
 * unsigned value in the range 0..255. The returned array always has the
 * full column width, including any zero padding.
 *
 * @param column index of the column to read
 * @return a new char array holding the column content
 */
public char[] getColChars(int column) {
    int width = row[column].cellwidth();
    int offset = colstart[column];
    char[] c = new char[width];
    // System.arraycopy cannot copy between arrays of different primitive
    // component types (it throws ArrayStoreException), so convert per element
    for (int i = 0; i < width; i++) c[i] = (char) (rowinstance[offset + i] & 0xff);
    return c;
}
public String toPropertyForm(boolean includeBraces, boolean decimalCardinal, boolean longname) {
serverByteBuffer bb = new serverByteBuffer();
if (includeBraces) bb.append('{');
for (int i = 0; i < row.length; i++) {
bb.append((longname) ? row[i].description() : row[i].nickname());
bb.append('=');
if ((decimalCardinal) && (row[i].celltype() == kelondroColumn.celltype_cardinal))
if ((decimalCardinal) && (row[i].celltype() == kelondroColumn.celltype_cardinal)) {
bb.append(Long.toString(getColLong(i)));
else
} else if ((decimalCardinal) && (row[i].celltype() == kelondroColumn.celltype_bitfield)) {
bb.append((new kelondroBitfield(getColBytes(i))).exportB64());
} else {
bb.append(rowinstance, colstart[i], row[i].cellwidth());
}
if (i < row.length - 1) {
bb.append(',');
if (longname) bb.append(' ');
@ -475,6 +492,9 @@ public class kelondroRow {
// and possibly some more
if (this.objectsize < otherRow.objectsize) return false;
for (int i = 0; i < otherRow.row.length; i++) {
if ((this.row[i].cellwidth() == otherRow.row[i].cellwidth()) &&
(this.row[i].celltype() == kelondroColumn.celltype_bitfield) &&
(otherRow.row[i].celltype() == kelondroColumn.celltype_binary)) continue;
if (!(this.row[i].equals(otherRow.row[i]))) return false;
}
return true;

@ -51,6 +51,7 @@ import java.io.File;
import java.io.IOException;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLoaderMessage;
@ -58,7 +59,6 @@ import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacyCore;
public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlWorker {
@ -297,7 +297,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
yacyCore.seedDB.mySeed.hash,
this.name,
(failreason==null)?"Unknown reason":failreason,
new bitfield()
new kelondroBitfield()
);
// store the entry

@ -47,6 +47,8 @@ package de.anomic.plasma;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
@ -58,14 +60,45 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
public final class plasmaCondenser {
// this is the page analysis class
// category flags that show how the page can be distinguished in different interest groups.
// flag_cat_indexof is used as a bit index into RESULT_FLAGS; the remaining
// constants are presumably bit indexes for the same bitfield (verify against callers).
public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
public static final int flag_cat_opencontent = 1; // open source, any free stuff
public static final int flag_cat_business = 2; // web shops, marketing, trade
public static final int flag_cat_stockfinance = 3; // stock exchange (quotes), finance, economy
public static final int flag_cat_health = 4; // health
public static final int flag_cat_sport = 5; // any sport, cars etc.
public static final int flag_cat_lifestyle = 6; // travel, lifestyle
public static final int flag_cat_politics = 7; // politics
public static final int flag_cat_news = 8; // blogs, news pages
public static final int flag_cat_children = 9; // toys, children's education, help for parents
public static final int flag_cat_entertainment = 10; // boulevard, entertainment, cultural content
public static final int flag_cat_knowledge = 11; // science, school stuff, help for homework
public static final int flag_cat_computer = 12; // any computer related stuff, networks, operating systems
public static final int flag_cat_p2p = 13; // p2p support, filesharing archives etc.
public static final int flag_cat_sex = 14; // sexual content
public static final int flag_cat_spam = 15; // pages that anybody would consider as not interesting
public static final int flag_cat_linux = 16; // pages about linux software
public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os
public static final int flag_cat_windows = 18; // pages about windows os and software
public static final int flag_cat_osreserve = 19; // reserve
private final static int numlength = 5;
//private Properties analysis;
@ -82,6 +115,7 @@ public final class plasmaCondenser {
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public int RESULT_SIMI_SENTENCES = -1;
public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);
public plasmaCondenser(InputStream text) {
this(text, 3, 2);
@ -96,6 +130,30 @@ public final class plasmaCondenser {
createCondensement(text);
}
// computes the index hash of a word: the word is lower-cased, digested with
// MD5, base64-encoded and cut to the common hash length
public static final String word2hash(String word) {
    final String normalized = word.toLowerCase();
    return kelondroBase64Order.enhancedCoder
            .encode(serverCodings.encodeMD5Raw(normalized))
            .substring(0, yacySeedDB.commonHashLength);
}
// hashes every word of the array and collects the hashes in a sorted set
public static final Set words2hashSet(String[] words) {
    final TreeSet result = new TreeSet();
    int idx = 0;
    while (idx < words.length) {
        result.add(word2hash(words[idx]));
        idx++;
    }
    return result;
}
// hashes every word of the array and concatenates the hashes, in array
// order and without any separator
public static final String words2hashString(String[] words) {
    final StringBuffer joined = new StringBuffer();
    int idx = 0;
    while (idx < words.length) {
        joined.append(word2hash(words[idx]));
        idx++;
    }
    return joined.toString();
}
// hashes every word (String) of the given set and collects the hashes in a
// sorted set
public static final Set words2hashes(Set words) {
    final TreeSet result = new TreeSet();
    for (Iterator it = words.iterator(); it.hasNext();) {
        final String word = (String) it.next();
        result.add(word2hash(word));
    }
    return result;
}
public int excludeWords(TreeSet stopwords) {
// subtracts the given stopwords from the word list
// the word list shrinkes. This returns the number of shrinked words
@ -186,12 +244,15 @@ public final class plasmaCondenser {
int idx;
int wordInSentenceCounter = 1;
Iterator it, it1;
boolean comb_indexof = false, comb_lastmodified = false, last_last = false, last_index = false;
// read source
sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize);
while (wordenum.hasMoreElements()) {
word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
// System.out.println("PARSED-WORD " + word);
// distinguish punctuation and words
wordlen = word.length();
if ((wordlen == 1) && (htmlFilterContentScraper.punctuation(word.charAt(0)))) {
// store sentence
@ -223,6 +284,12 @@ public final class plasmaCondenser {
currsentwords.clear();
wordInSentenceCounter = 1;
} else {
// check index.of detection
if ((last_last) && (word.equals("modified"))) comb_lastmodified = true;
if ((last_index) && (word.equals("of"))) comb_indexof = true;
last_last = word.equals("last");
last_index = word.equals("index");
// store word
allwordcounter++;
currsentwords.add(word);
@ -353,7 +420,7 @@ public final class plasmaCondenser {
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
this.RESULT_SIMI_SENTENCES = sentences.size();
//this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
this.RESULT_FLAGS.set(flag_cat_indexof, comb_indexof && comb_lastmodified);
}
public void print() {
@ -505,7 +572,7 @@ public final class plasmaCondenser {
loop: while (e.hasMoreElements()) {
s = (String) e.nextElement();
if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
if (s.length() < ml) continue loop;
if ((s.length() < ml) && (!(s.equals("of")))) continue loop;
for (int i = 0; i < s.length(); i++) {
c = s.charAt(i);
// TODO: Bugfix needed for UTF-8
@ -730,27 +797,6 @@ public final class plasmaCondenser {
return new String(s);
}
/*
private static void addLineSearchProp(Properties prop, String s, String[] searchwords, HashSet foundsearch) {
// we store lines containing a key in search vector
int p;
String r;
s = " " + s.toLowerCase() + " ";
for (int i = 0; i < searchwords.length; i++) {
if (!(foundsearch.contains(searchwords[i]))) {
p = s.indexOf((String) searchwords[i]);
if (p >= 0) {
// we found one key in the result text
// prepare a line and put it to the property
r = s.substring(0, p) + "<B>" + s.substring(p, p + searchwords[i].length()) + "</B>" + s.substring(p + searchwords[i].length());
prop.setProperty("key-" + searchwords[i], r);
// remember that we found this
foundsearch.add(searchwords[i]);
}
}
}
}
*/
public static Iterator getWords(InputStream input) {
if (input == null) return null;
@ -765,47 +811,30 @@ public final class plasmaCondenser {
}
/**
 * Developer tool: reads a property file and converts its keyword lists into
 * configuration lines.
 *
 * The properties "keywords0" to "keywords15" are expected to hold
 * comma-separated word lists. For each of them one quoted string is printed
 * that is the concatenation of the word hashes of its words; a missing
 * property yields an empty string.
 *
 * @param args args[0] is the path of the property file
 */
public static void main(String[] args) {
    if (args.length < 1) {
        System.err.println("usage: plasmaCondenser <property-file>");
        return;
    }
    try {
        File f = new File(args[0]);
        Properties p = new Properties();
        FileInputStream fis = new FileInputStream(f);
        try {
            p.load(fis);
        } finally {
            // always release the file handle, even if loading fails
            fis.close();
        }

        StringBuffer sb = new StringBuffer();
        sb.append("{\n");
        for (int i = 0; i <= 15; i++) {
            sb.append('"');
            String s = p.getProperty("keywords" + i);
            if (s != null) {
                String[] l = s.split(",");
                for (int j = 0; j < l.length; j++) {
                    sb.append(word2hash(l[j]));
                }
            }
            // close the string literal that was opened above
            sb.append('"');
            if (i < 15) sb.append(",\n");
        }
        sb.append("}\n");
        System.out.println(sb.toString());
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
}

@ -55,13 +55,13 @@ import java.util.LinkedList;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlEURL {
@ -215,7 +215,7 @@ public class plasmaCrawlEURL {
}
public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor,
String name, String failreason, bitfield flags) {
String name, String failreason, kelondroBitfield flags) {
if ((referrer == null) || (referrer.length() < yacySeedDB.commonHashLength)) referrer = plasmaURL.dummyHash;
if ((initiator == null) || (initiator.length() < yacySeedDB.commonHashLength)) initiator = plasmaURL.dummyHash;
if ((executor == null) || (executor.length() < yacySeedDB.commonHashLength)) executor = plasmaURL.dummyHash;
@ -279,11 +279,11 @@ public class plasmaCrawlEURL {
private Date trydate; // the time when the url was last time tried to load
private int trycount; // number of tryings
private String failreason; // string describing reason for load fail
private bitfield flags; // extra space
private kelondroBitfield flags; // extra space
private boolean stored;
public Entry(URL url, String referrer, String initiator,
String executor, String name, String failreason, bitfield flags) {
String executor, String name, String failreason, kelondroBitfield flags) {
// create new entry
this.hash = plasmaURL.urlHash(url);
this.referrer = (referrer == null) ? plasmaURL.dummyHash : referrer;
@ -333,7 +333,7 @@ public class plasmaCrawlEURL {
this.trydate = new Date(86400000 * entry.getColLong(7));
this.trycount = (int) entry.getColLong(8);
this.failreason = entry.getColString(9, "UTF-8");
this.flags = new bitfield(entry.getColBytes(10));
this.flags = new kelondroBitfield(entry.getColBytes(10));
return;
}
@ -358,7 +358,7 @@ public class plasmaCrawlEURL {
trydatestr.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, rowdef.width(8)).getBytes(),
this.failreason.getBytes(),
this.flags.getBytes()
this.flags.bytes()
};
urlIndexFile.put(urlIndexFile.row().newEntry(entry));
this.stored = true;

@ -67,6 +67,7 @@ import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryNew;
import de.anomic.index.indexURLEntryOld;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroFlexSplitTable;
import de.anomic.kelondro.kelondroBase64Order;
@ -78,7 +79,6 @@ import de.anomic.net.URL;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCodings;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacySeedDB;
public final class plasmaCrawlLURL {
@ -273,7 +273,7 @@ public final class plasmaCrawlLURL {
long size,
int wc,
char dt,
bitfield flags,
kelondroBitfield flags,
String lang,
int llocal,
int lother,

@ -52,6 +52,7 @@ import java.util.Iterator;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroFlexTable;
@ -62,7 +63,6 @@ import de.anomic.kelondro.kelondroStack;
import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlNURL {
@ -90,7 +90,7 @@ public class plasmaCrawlNURL {
"Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-3 {b64e}, " + // number of anchors of the parent
"Cardinal forkfactor-4 {b64e}, " + // sum of anchors of all ancestors
"byte[] flags-2, " + // flags
"byte[] flags-4, " + // flags
"String handle-4" // extra handle
);
@ -196,7 +196,7 @@ public class plasmaCrawlNURL {
private void openHashCache() {
if (newdb) {
String newCacheName = "urlNotice4.table";
String newCacheName = "urlNotice5.table";
cacheStacksPath.mkdirs();
try {
urlIndexFile = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, bufferkb / 2 * 0x400, preloadTime, rowdef, kelondroBase64Order.enhancedCoder), bufferkb / 2 * 0x400, true, false);
@ -499,7 +499,7 @@ public class plasmaCrawlNURL {
private int depth; // the prefetch depth so far, starts at 0
private int anchors; // number of anchors of the parent
private int forkfactor; // sum of anchors of all ancestors
private bitfield flags;
private kelondroBitfield flags;
private int handle;
private boolean stored;
@ -524,7 +524,7 @@ public class plasmaCrawlNURL {
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new bitfield(rowdef.width(10));
this.flags = new kelondroBitfield(rowdef.width(10));
this.handle = 0;
this.stored = false;
}
@ -570,7 +570,7 @@ public class plasmaCrawlNURL {
this.depth = (int) entry.getColLong(7);
this.anchors = (int) entry.getColLong(8);
this.forkfactor = (int) entry.getColLong(9);
this.flags = new bitfield(entry.getColBytes(10));
this.flags = new kelondroBitfield(entry.getColBytes(10));
this.handle = Integer.parseInt(entry.getColString(11, null), 16);
return;
}
@ -593,7 +593,7 @@ public class plasmaCrawlNURL {
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, rowdef.width(7)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, rowdef.width(8)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, rowdef.width(9)).getBytes(),
this.flags.getBytes(),
this.flags.bytes(),
normalizeHandle(this.handle).getBytes()
};
if (urlIndexFile == null) System.out.println("urlHashCache is NULL");
@ -622,7 +622,7 @@ public class plasmaCrawlNURL {
.append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
.append("depth: ").append(Integer.toString(depth)).append(" | ")
.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
.append("flags: ").append((flags==null) ? "null" : flags.toString());
.append("flags: ").append((flags==null) ? "null" : flags.exportB64());
return str.toString();
}

@ -62,6 +62,7 @@ import de.anomic.http.httpc;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroFlexTable;
@ -74,7 +75,6 @@ import de.anomic.net.URL;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverSemaphore;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacyCore;
public final class plasmaCrawlStacker {
@ -465,7 +465,7 @@ public final class plasmaCrawlStacker {
private int depth; // the prefetch depth so far, starts at 0
private int anchors; // number of anchors of the parent
private int forkfactor; // sum of anchors of all ancestors
private bitfield flags;
private kelondroBitfield flags;
private int handle;
// loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) {
@ -491,7 +491,7 @@ public final class plasmaCrawlStacker {
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new bitfield();
this.flags = new kelondroBitfield();
this.handle = 0;
} catch (Exception e) {
e.printStackTrace();
@ -513,7 +513,7 @@ public final class plasmaCrawlStacker {
this.depth = (int) entry.getColLong(7);
this.anchors = (int) entry.getColLong(8);
this.forkfactor = (int) entry.getColLong(9);
this.flags = new bitfield(entry.getColBytes(10));
this.flags = new kelondroBitfield(entry.getColBytes(10));
try {
this.handle = Integer.parseInt(new String(entry.getColBytes(11), "UTF-8"));
} catch (NumberFormatException ee) {
@ -591,7 +591,7 @@ public final class plasmaCrawlStacker {
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, plasmaCrawlNURL.rowdef.width(7)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, plasmaCrawlNURL.rowdef.width(8)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, plasmaCrawlNURL.rowdef.width(9)).getBytes(),
this.flags.getBytes(),
this.flags.bytes(),
normalizeHandle(this.handle).getBytes()
};
} catch (UnsupportedEncodingException e) { /* ignore this */ }
@ -671,7 +671,7 @@ public final class plasmaCrawlStacker {
// do nothing..
}
if (this.dbtype == QUEUE_DB_TYPE_FLEX) {
kelondroFlexWidthArray.delete(cacheStacksPath, "urlPreNotice1.table");
kelondroFlexWidthArray.delete(cacheStacksPath, "urlPreNotice2.table");
}
if (this.dbtype == QUEUE_DB_TYPE_TREE) {
File cacheFile = new File(cacheStacksPath, "urlPreNotice.db");
@ -686,7 +686,7 @@ public final class plasmaCrawlStacker {
this.urlEntryCache = new kelondroRowSet(plasmaCrawlNURL.rowdef, kelondroBase64Order.enhancedCoder, 0, 0);
}
if (this.dbtype == QUEUE_DB_TYPE_FLEX) {
String newCacheName = "urlPreNotice1.table";
String newCacheName = "urlPreNotice2.table";
cacheStacksPath.mkdirs();
try {
this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, bufferkb / 2 * 0x400, preloadTime, plasmaCrawlNURL.rowdef, kelondroBase64Order.enhancedCoder), bufferkb / 2 * 0x400, true, false);
@ -1058,7 +1058,7 @@ public final class plasmaCrawlStacker {
yacyCore.seedDB.mySeed.hash,
this.theMsg.name,
rejectReason,
new bitfield()
new kelondroBitfield()
);
ee.store();
sb.urlPool.errorURL.stackPushEntry(ee);

@ -52,6 +52,7 @@ import java.util.Map;
import de.anomic.kelondro.kelondroAttrSeq;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCollectionIndex;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
@ -60,7 +61,6 @@ import de.anomic.kelondro.kelondroRowSet;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverMemory;
import de.anomic.tools.bitfield;
public class plasmaRankingCRProcess {
@ -98,11 +98,11 @@ public class plasmaRankingCRProcess {
String key;
kelondroAttrSeq.Entry new_entry, acc_entry;
int FUDate, FDDate, LUDate, UCount, PCount, ACount, VCount, Vita;
bitfield acc_flags, new_flags;
kelondroBitfield acc_flags, new_flags;
while (el.hasNext()) {
key = (String) el.next();
new_entry = source_cr.getEntry(key);
new_flags = new bitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) new_entry.getAttr("Flags", 0), 1).getBytes());
new_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) new_entry.getAttr("Flags", 0), 1).getBytes());
// enrich information with additional values
if ((acc_entry = acc.getEntry(key)) != null) {
FUDate = (int) acc_entry.getAttr("FUDate", 0);
@ -123,11 +123,11 @@ public class plasmaRankingCRProcess {
VCount += (new_flags.get(3)) ? 1 : 0;
// 'OR' the flags
acc_flags = new bitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) acc_entry.getAttr("Flags", 0), 1).getBytes());
acc_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) acc_entry.getAttr("Flags", 0), 1).getBytes());
for (int i = 0; i < 6; i++) {
if (new_flags.get(i)) acc_flags.set(i, true);
}
acc_entry.setAttr("Flags", (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(acc_flags.getBytes())));
acc_entry.setAttr("Flags", (int) kelondroBase64Order.enhancedCoder.decodeLong(acc_flags.exportB64()));
} else {
// initialize counters and dates
acc_entry = acc.newEntry(key, new_entry.getAttrs(), new_entry.getSeqSet());
@ -172,11 +172,11 @@ public class plasmaRankingCRProcess {
kelondroAttrSeq.Entry new_entry;
kelondroRow.Entry acc_entry;
int FUDate, FDDate, LUDate, UCount, PCount, ACount, VCount, Vita;
bitfield acc_flags, new_flags;
kelondroBitfield acc_flags, new_flags;
while (el.hasNext()) {
key = (String) el.next();
new_entry = source_cr.getEntry(key);
new_flags = new bitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) new_entry.getAttr("Flags", 0), 1).getBytes());
new_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong((long) new_entry.getAttr("Flags", 0), 1).getBytes());
// enrich information with additional values
if ((acc_entry = acc.get(key.getBytes())) != null) {
FUDate = (int) acc_entry.getColLong("FUDate", 0);
@ -197,11 +197,11 @@ public class plasmaRankingCRProcess {
VCount += (new_flags.get(3)) ? 1 : 0;
// 'OR' the flags
acc_flags = new bitfield(kelondroBase64Order.enhancedCoder.encodeLong(acc_entry.getColLong("Flags", 0), 1).getBytes());
acc_flags = new kelondroBitfield(kelondroBase64Order.enhancedCoder.encodeLong(acc_entry.getColLong("Flags", 0), 1).getBytes());
for (int i = 0; i < 6; i++) {
if (new_flags.get(i)) acc_flags.set(i, true);
}
acc_entry.setCol("Flags", (int) kelondroBase64Order.enhancedCoder.decodeLong(new String(acc_flags.getBytes())));
acc_entry.setCol("Flags", (int) kelondroBase64Order.enhancedCoder.decodeLong(acc_flags.exportB64()));
} else {
// initialize counters and dates
acc_entry = acc.row().newEntry();

@ -140,7 +140,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
long primaryTimeout = System.currentTimeMillis() + profileGlobal.duetime();
primarySearchThreads = yacySearch.primaryRemoteSearches(plasmaSearchQuery.hashSet2hashString(query.queryHashes), "",
query.prefer, query.urlMask, query.maxDistance, urlStore, wordIndex, rcContainers, rcAbstracts,
fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking);
fetchpeers, plasmaSwitchboard.urlBlacklist, snippetCache, profileGlobal, ranking, query.constraint);
// meanwhile do a local search
Map searchContainerMap = localSearchContainers(null);
@ -281,7 +281,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words);
secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch(
words, urls, urlStore, wordIndex, rcContainers, peer, plasmaSwitchboard.urlBlacklist, snippetCache,
profileGlobal, ranking);
profileGlobal, ranking, query.constraint);
}
}

@ -100,6 +100,7 @@ public final class plasmaSearchPreOrder {
for (int j = 0; j < count; j++) {
iEntry = (indexRWIEntry) i.next();
if (iEntry.urlHash().length() != container.row().width(container.primarykey())) continue;
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint
pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry);
}
}

@ -48,6 +48,7 @@ import java.util.Set;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterAbstractScraper;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.server.serverCharBuffer;
import de.anomic.yacy.yacySeedDB;
@ -60,6 +61,9 @@ public final class plasmaSearchQuery {
public static final int SEARCHDOM_GLOBALDHT = 3;
public static final int SEARCHDOM_GLOBALALL = 4;
public static final kelondroBitfield empty_constraint = new kelondroBitfield(4, "AAAAAA");
public static final kelondroBitfield catchall_constraint = new kelondroBitfield(4, "______");
public Set queryWords, queryHashes;
public int wantedResults;
public String prefer;
@ -69,24 +73,28 @@ public final class plasmaSearchQuery {
public String domGroupName;
public int domMaxTargets;
public int maxDistance;
public kelondroBitfield constraint;
public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer,
int wantedResults, long maximumTime, String urlMask,
int domType, String domGroupName, int domMaxTargets) {
int domType, String domGroupName, int domMaxTargets,
kelondroBitfield constraint) {
this.queryWords = queryWords;
this.maxDistance = maxDistance;
this.prefer = prefer;
this.queryHashes = words2hashes(queryWords);
this.queryHashes = plasmaCondenser.words2hashes(queryWords);
this.wantedResults = wantedResults;
this.maximumTime = maximumTime;
this.urlMask = urlMask;
this.domType = domType;
this.domGroupName = domGroupName;
this.domMaxTargets = domMaxTargets;
this.constraint = constraint;
}
public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer,
int wantedResults, long maximumTime, String urlMask) {
int wantedResults, long maximumTime, String urlMask,
kelondroBitfield constraint) {
this.queryWords = null;
this.maxDistance = maxDistance;
this.prefer = prefer;
@ -97,25 +105,7 @@ public final class plasmaSearchQuery {
this.domType = -1;
this.domGroupName = null;
this.domMaxTargets = -1;
}
/**
 * Hashes every word of an array and collects the hashes in a set.
 *
 * @param words the words to hash
 * @return a TreeSet containing plasmaURL.word2hash(w) for each element w of words
 */
public static Set words2hashSet(String[] words) {
    final TreeSet result = new TreeSet();
    int idx = 0;
    while (idx < words.length) {
        result.add(plasmaURL.word2hash(words[idx]));
        idx++;
    }
    return result;
}
/**
 * Hashes every word of an array and concatenates the hashes, in array order,
 * into one string without any separator.
 *
 * @param words the words to hash
 * @return the concatenation of plasmaURL.word2hash(w) for each element w of words
 */
public static String words2hashString(String[] words) {
    final StringBuffer joined = new StringBuffer();
    int idx = 0;
    while (idx < words.length) {
        joined.append(plasmaURL.word2hash(words[idx]));
        idx++;
    }
    return joined.toString();
}
public static Set words2hashes(Set words) {
Iterator i = words.iterator();
TreeSet hashes = new TreeSet();
while (i.hasNext()) hashes.add(plasmaURL.word2hash((String) i.next()));
return hashes;
this.constraint = constraint;
}
public static Set hashes2Set(String query) {

@ -211,8 +211,8 @@ public class plasmaSearchRankingProfile {
}
// apply query-in-result matching
Set urlcomph = plasmaSearchQuery.words2hashSet(urlcomps);
Set descrcomph = plasmaSearchQuery.words2hashSet(descrcomps);
Set urlcomph = plasmaCondenser.words2hashSet(urlcomps);
Set descrcomph = plasmaCondenser.words2hashSet(descrcomps);
Iterator shi = query.queryHashes.iterator();
String queryhash;
while (shi.hasNext()) {

@ -230,7 +230,7 @@ public final class plasmaSearchResult {
word = words[i].toLowerCase();
if ((word.length() > 2) &&
("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
(!(query.queryHashes.contains(plasmaURL.word2hash(word)))))
(!(query.queryHashes.contains(plasmaCondenser.word2hash(word)))))
ref.incScore(word);
}
}

@ -159,7 +159,7 @@ public class plasmaSnippetCache {
}
//end contrib [MN]
if (plasmaURL.word2hash(w[j]).equals(h)) w[j] = "<b>" + w[j] + "</b>";
if (plasmaCondenser.word2hash(w[j]).equals(h)) w[j] = "<b>" + w[j] + "</b>";
w[j] = prefix + w[j] + postfix;
}
}
@ -480,7 +480,7 @@ public class plasmaSnippetCache {
String word;
while (words.hasMoreElements()) {
word = (String) words.nextElement();
map.put(plasmaURL.word2hash(word), new Integer(pos));
map.put(plasmaCondenser.word2hash(word), new Integer(pos));
pos += word.length() + 1;
}
return map;

@ -136,6 +136,7 @@ import de.anomic.index.indexRWIEntry;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroMapTable;
@ -155,7 +156,6 @@ import de.anomic.server.serverSemaphore;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverThread;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.tools.crypt;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyClient;
@ -1497,7 +1497,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (document == null) return;
} catch (ParserException e) {
this.log.logInfo("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new bitfield());
addURLtoErrorDB(entry.url(), entry.referrerHash(), initiatorPeerHash, entry.anchorName(), e.getErrorCode(), new kelondroBitfield());
if (document != null) {
document.close();
document = null;
@ -1587,9 +1587,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
new byte[0], // md5
(int) entry.size(), // size
condenser.RESULT_NUMB_WORDS, // word count
plasmaURL.docType(document.getMimeType()), // doctype
new bitfield(4), // flags
plasmaURL.language(entry.url()), // language
plasmaURL.docType(document.getMimeType()), // doctype
condenser.RESULT_FLAGS, // flags
plasmaURL.language(entry.url()), // language
0,0,0,0,0,0
);
/* ========================================================================
@ -1672,7 +1672,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
wentry = (Map.Entry) i.next();
String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = plasmaURL.word2hash(word);
String wordHash = plasmaCondenser.word2hash(word);
indexRWIEntry wordIdxEntry = wordIndex.newRWIEntry(
urlHash,
urlLength, urlComps,
@ -1692,7 +1692,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
doctype,
ioLinks[0].intValue(),
ioLinks[1].intValue(),
true
condenser.RESULT_FLAGS
);
indexContainer wordIdxContainer = wordIndex.emptyContainer(wordHash);
wordIdxContainer.add(wordIdxEntry);
@ -1764,7 +1764,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
} else {
log.logFine("Not Indexed Resource '" + entry.normalizedURLString() + "': process case=" + processCase);
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new bitfield());
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNKNOWN_INDEXING_PROCESS_CASE, new kelondroBitfield());
}
} catch (Exception ee) {
if (ee instanceof InterruptedException) throw (InterruptedException)ee;
@ -1776,7 +1776,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
yacyClient.crawlReceipt(initiatorPeer, "crawl", "exception", ee.getMessage(), null, "");
}
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new bitfield());
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, plasmaCrawlEURL.DENIED_UNSPECIFIED_INDEXING_ERROR, new kelondroBitfield());
}
} else {
@ -1784,7 +1784,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logInfo("Not indexed any word in URL " + entry.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new bitfield());
addURLtoErrorDB(entry.url(), referrerUrlHash, initiatorPeerHash, docDescription, noIndexReason, new kelondroBitfield());
if ((processCase == PROCESSCASE_6_GLOBAL_CRAWLING) && (initiatorPeer != null)) {
yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
}
@ -2265,7 +2265,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
while (iter.hasNext()) {
word = (String) iter.next();
// delete the URL reference in this word index
if (wordIndex.removeEntry(plasmaURL.word2hash(word), urlhash, true)) count++;
if (wordIndex.removeEntry(plasmaCondenser.word2hash(word), urlhash, true)) count++;
}
return count;
}
@ -2280,7 +2280,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
entry = (Map.Entry) wordStatPropIterator.next();
word = (String) entry.getKey();
// delete the URL reference in this word index
if (wordIndex.removeEntry(plasmaURL.word2hash(word), urlhash, true)) count++;
if (wordIndex.removeEntry(plasmaCondenser.word2hash(word), urlhash, true)) count++;
}
return count;
}
@ -2516,7 +2516,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String initiator,
String name,
String failreason,
bitfield flags
kelondroBitfield flags
) {
// create a new errorURL DB entry
plasmaCrawlEURL.Entry ee = this.urlPool.errorURL.newEntry(

@ -665,11 +665,6 @@ public class plasmaURL {
// local flag attributes
public static final char LT_LOCAL = 'L';
public static final char LT_GLOBAL = 'G';
// Create a word hash: the word is lower-cased, passed through
// serverCodings.encodeMD5Raw, encoded with kelondroBase64Order.enhancedCoder,
// and the result is cut to its first yacySeedDB.commonHashLength characters.
// NOTE(review): toLowerCase() without an argument uses the JVM default locale;
// confirm whether hashes are expected to be identical across locales.
public static String word2hash(String word) {
    final String lowered = word.toLowerCase();
    return kelondroBase64Order.enhancedCoder
            .encode(serverCodings.encodeMD5Raw(lowered))
            .substring(0, yacySeedDB.commonHashLength);
}
// doctype calculation
public static char docType(URL url) {

@ -47,6 +47,7 @@ import de.anomic.index.indexRWIEntryNew;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMergeIterator;
import de.anomic.kelondro.kelondroOrder;
@ -133,15 +134,15 @@ public final class plasmaWordIndex implements indexRI {
char doctype,
int outlinksSame,
int outlinksOther,
boolean local ) {
kelondroBitfield flags ) {
if (useCollectionIndex)
return new indexRWIEntryNew(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount,
posintext, posinphrase, posofphrase, worddistance, sizeOfPage, lastmodified, updatetime, quality, language, doctype,
outlinksSame, outlinksOther, local);
outlinksSame, outlinksOther, flags);
else
return new indexRWIEntryOld(urlHash, urlLength, urlComps, titleLength, hitcount, wordcount, phrasecount,
posintext, posinphrase, posofphrase, worddistance, sizeOfPage, lastmodified, updatetime, quality, language, doctype,
outlinksSame, outlinksOther, local);
outlinksSame, outlinksOther, false);
}
public File getRoot() {
@ -381,7 +382,7 @@ public final class plasmaWordIndex implements indexRI {
word = (String) wentry.getKey();
wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaURL.word2hash(word);
wordHash = plasmaCondenser.word2hash(word);
ientry = newRWIEntry(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
wprop.count,
@ -398,7 +399,7 @@ public final class plasmaWordIndex implements indexRI {
language,
doctype,
outlinksSame, outlinksOther,
true);
condenser.RESULT_FLAGS);
addEntry(wordHash, ientry, System.currentTimeMillis(), false);
}
// System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +

@ -61,7 +61,9 @@ import de.anomic.index.indexRWIEntryOld;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchTimingProfile;
@ -381,7 +383,8 @@ public final class yacyClient {
plasmaURLPattern blacklist,
plasmaSnippetCache snippets,
plasmaSearchTimingProfile timingProfile,
plasmaSearchRankingProfile rankingProfile
plasmaSearchRankingProfile rankingProfile,
kelondroBitfield constraint
) {
// send a search request to peer with remote Hash
// this mainly converts the words into word hashes
@ -434,6 +437,7 @@ public final class yacyClient {
obj.put("profile", timingProfile.targetToString()); // new duetimes splitted by specific search tasks
obj.put("maxdist", maxDistance);
obj.put("rankingProfile", rankingProfile.toExternalString());
obj.put("constraint", constraint.exportB64());
obj.put(yacySeed.MYTIME, yacyCore.universalDateShortString(new Date()));
if (abstractCache != null) obj.put("abstracts", "auto");
@ -534,7 +538,7 @@ public final class yacyClient {
urlEntry.language(),
urlEntry.doctype(),
0,0,
false
new kelondroBitfield(4)
);
} else {
// the new way: the search-result-url transports all the attributes of word indexes
@ -1206,7 +1210,7 @@ public final class yacyClient {
/*final yacyCore core =*/ new yacyCore(sb);
yacyCore.peerActions.loadSeedLists();
final yacySeed target = yacyCore.seedDB.getConnected(args[1]);
final String wordhashe = plasmaURL.word2hash("test");
final String wordhashe = plasmaCondenser.word2hash("test");
//System.out.println("permission=" + permissionMessage(args[1]));
// should we use the proxy?

@ -51,6 +51,7 @@ import java.util.Set;
import java.util.TreeMap;
import de.anomic.index.indexContainer;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchQuery;
@ -77,12 +78,14 @@ public class yacySearch extends Thread {
final private plasmaSearchTimingProfile timingProfile;
final private plasmaSearchRankingProfile rankingProfile;
final private String prefer, filter;
final private kelondroBitfield constraint;
public yacySearch(String wordhashes, String urlhashes, String prefer, String filter, int maxDistance,
boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager, plasmaWordIndex wordIndex,
indexContainer containerCache, Map abstractCache,
plasmaURLPattern blacklist, plasmaSnippetCache snippetCache,
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) {
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile,
kelondroBitfield constraint) {
super("yacySearch_" + targetPeer.getName());
System.out.println("DEBUG - yacySearch thread " + this.getName() + " initialized " + ((urlhashes.length() == 0) ? "(primary)" : "(secondary)"));
this.wordhashes = wordhashes;
@ -101,10 +104,11 @@ public class yacySearch extends Thread {
this.maxDistance = maxDistance;
this.timingProfile = (plasmaSearchTimingProfile) timingProfile.clone();
this.rankingProfile = rankingProfile;
this.constraint = constraint;
}
public void run() {
this.urls = yacyClient.search(wordhashes, urlhashes, prefer, filter, maxDistance, global, targetPeer, urlManager, wordIndex, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile);
this.urls = yacyClient.search(wordhashes, urlhashes, prefer, filter, maxDistance, global, targetPeer, urlManager, wordIndex, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile, constraint);
if (urls != null) {
StringBuffer urllist = new StringBuffer(this.urls.length * 13);
for (int i = 0; i < this.urls.length; i++) urllist.append(this.urls[i]).append(' ');
@ -201,7 +205,8 @@ public class yacySearch extends Thread {
plasmaCrawlLURL urlManager, plasmaWordIndex wordIndex,
indexContainer containerCache, Map abstractCache,
int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache,
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) {
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile,
kelondroBitfield constraint) {
// check own peer status
if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; }
@ -214,7 +219,7 @@ public class yacySearch extends Thread {
yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) {
searchThreads[i]= new yacySearch(wordhashes, urlhashes, prefer, filter, maxDist, true, targetPeers[i],
urlManager, wordIndex, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile);
urlManager, wordIndex, containerCache, abstractCache, blacklist, snippetCache, timingProfile, rankingProfile, constraint);
searchThreads[i].start();
//try {Thread.sleep(20);} catch (InterruptedException e) {}
}
@ -225,7 +230,8 @@ public class yacySearch extends Thread {
plasmaCrawlLURL urlManager, plasmaWordIndex wordIndex,
indexContainer containerCache,
String targethash, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache,
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile) {
plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile,
kelondroBitfield constraint) {
// check own peer status
if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return null; }
@ -234,8 +240,8 @@ public class yacySearch extends Thread {
final yacySeed targetPeer = yacyCore.seedDB.getConnected(targethash);
if (targetPeer == null) return null;
yacySearch searchThread = new yacySearch(wordhashes, urlhashes, "", "", 9999, true, targetPeer,
urlManager, wordIndex, containerCache, new TreeMap(), blacklist, snippetCache, timingProfile, rankingProfile);
searchThread.start();
urlManager, wordIndex, containerCache, new TreeMap(), blacklist, snippetCache, timingProfile, rankingProfile, constraint);
searchThread.start();
return searchThread;
}

@ -75,17 +75,18 @@ import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexRWIEntryOld;
import de.anomic.index.indexURLEntry;
import de.anomic.index.indexURLEntryOld;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaURLPool;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaWordIndexAssortmentCluster;
@ -98,7 +99,6 @@ import de.anomic.server.serverPlainSwitch;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverSystem;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.bitfield;
import de.anomic.tools.enumerateFiles;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
@ -808,7 +808,7 @@ public final class yacy {
try {
String word;
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist)));
while ((word = br.readLine()) != null) wordmap.put(plasmaURL.word2hash(word),word);
while ((word = br.readLine()) != null) wordmap.put(plasmaCondenser.word2hash(word),word);
br.close();
} catch (IOException e) {}
return wordmap;
@ -913,7 +913,7 @@ public final class yacy {
Iterator i = stopwords.iterator();
while (i.hasNext()) {
w = (String) i.next();
f = plasmaWordIndexFile.wordHash2path(dbRoot, plasmaURL.word2hash(w));
f = plasmaWordIndexFile.wordHash2path(dbRoot, plasmaCondenser.word2hash(w));
if (f.exists()) {
thisamount = f.length();
if (f.delete()) {
@ -1173,7 +1173,7 @@ public final class yacy {
oldentry.size(),
oldentry.wordCount(),
oldentry.doctype(),
new bitfield(4),
new kelondroBitfield(4),
oldentry.language(),
0, 0, 0, 0, 0, 0);
pool.loadedURL.store(newentry);

@ -818,10 +818,10 @@ currentSkin=
# temporary flags for the new database structure. set these to true only for testing
# ALL DATA THAT IS CREATED WITH THIS FLAG ON WILL BE VOID IN A FINAL VERSION
# table-types: RAM = 0, TREE = 1, FLEX = 2;
useCollectionIndex=false
useFlexTableForNURL=false
useCollectionIndex=true
useFlexTableForNURL=true
useFlexTableForEURL=true
useFlexTableForLURL=false
useFlexTableForLURL=true
tableTypeForPreNURL=2
# flag to show surftipps on index.html page

Loading…
Cancel
Save