Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git

pull/1/head
sof 13 years ago
commit b09988c9f8

@ -762,6 +762,9 @@ search.navigation=hosts,authors,namespace,topics,filetype,protocol
all search results are valid without verification
search.verify = iffresh
search.excludehosts=
search.excludehosth=
# If a link verification fails, the corresponding index reference can be
# deleted to clean up the index. When this property is set, a failed index verification in
# the nocache, iffresh and ifexist cases causes the index reference to be deleted.

@ -197,14 +197,13 @@ public class Bookmarks {
// try to get the bookmark from the LURL database
final URIMetadataRow urlentry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(ASCII.getBytes(urlHash));
if (urlentry != null) try {
final URIMetadataRow.Components metadata = urlentry.metadata();
final Document document = Document.mergeDocuments(metadata.url(), null, sb.loader.loadDocuments(sb.loader.request(metadata.url(), true, false), CacheStrategy.IFEXIST, 5000, Integer.MAX_VALUE));
final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, 5000, Integer.MAX_VALUE));
prop.put("mode_edit", "0"); // create mode
prop.put("mode_url", metadata.url().toNormalform(false, true));
prop.putHTML("mode_title", metadata.dc_title());
prop.putHTML("mode_description", (document == null) ? metadata.dc_title(): document.dc_title());
prop.putHTML("mode_author", metadata.dc_creator());
prop.putHTML("mode_tags", (document == null) ? metadata.dc_subject() : document.dc_subject(','));
prop.put("mode_url", urlentry.url().toNormalform(false, true));
prop.putHTML("mode_title", urlentry.dc_title());
prop.putHTML("mode_description", (document == null) ? urlentry.dc_title(): document.dc_title());
prop.putHTML("mode_author", urlentry.dc_creator());
prop.putHTML("mode_tags", (document == null) ? urlentry.dc_subject() : document.dc_subject(','));
prop.putHTML("mode_path","");
prop.put("mode_public", "0");
prop.put("mode_feed", "0"); //TODO: check if it IS a feed

@ -32,9 +32,11 @@
<li>
<img src="/env/grafics/ok.png" height="16" width="16" alt="ok" />&nbsp;Select a language for the interface:<br />
<fieldset>
<input type="radio" name="language" id="lang_de" value="de" onchange="this.form.submit()" #(langDeutsch)#::checked="checked"#(/langDeutsch)# /><label for="lang_de">Deutsch</label>&nbsp;
<input type="radio" name="language" id="lang_fr" value="fr" onchange="this.form.submit()" #(langFrancais)#::checked="checked"#(/langFrancais)# /><label for="lang_fr">Fran&ccedil;ais</label>&nbsp;
<input type="radio" name="language" value="default" id="lang_en" onchange="this.form.submit()" #(langEnglish)#::checked="checked"#(/langEnglish)# /><label for="lang_en">English</label>
<input type="radio" name="language" value="de" id="lang_de" onchange="this.form.submit()" #(lang_de)#::checked="checked"#(/lang_de)# /><label for="lang_de">Deutsch</label>&nbsp;
<input type="radio" name="language" value="fr" id="lang_fr" onchange="this.form.submit()" #(lang_fr)#::checked="checked"#(/lang_fr)# /><label for="lang_fr">Fran&ccedil;ais</label>&nbsp;
<input type="radio" name="language" value="cn" id="lang_cn" onchange="this.form.submit()" #(lang_cn)#::checked="checked"#(/lang_cn)# /><label for="lang_cn">&#27721;&#35821;/&#28450;&#35486;</label>
<input type="radio" name="language" value="ru" id="lang_ru" onchange="this.form.submit()" #(lang_ru)#::checked="checked"#(/lang_ru)# /><label for="lang_ru">Russian</label>
<input type="radio" name="language" value="default" id="lang_en" onchange="this.form.submit()" #(lang_en)#::checked="checked"#(/lang_en)# /><label for="lang_en">English</label>
</fieldset>
</li>
<!-- take care that no other items are changed, but also change the former if no js is enabled -->

@ -263,22 +263,15 @@ public class ConfigBasic {
prop.putHTML("defaultName", sb.peers.mySeed().getName());
prop.putHTML("defaultPort", env.getConfig("port", "8090"));
lang = env.getConfig("locale.language", "default"); // re-assign lang, may have changed
prop.put("lang_de", "0");
prop.put("lang_fr", "0");
prop.put("lang_cn", "0");
prop.put("lang_ru", "0");
prop.put("lang_en", "0");
if ("default".equals(lang)) {
prop.put("langDeutsch", "0");
prop.put("langFrancais", "0");
prop.put("langEnglish", "1");
} else if ("fr".equals(lang)) {
prop.put("langDeutsch", "0");
prop.put("langFrancais", "1");
prop.put("langEnglish", "0");
} else if ("de".equals(lang)) {
prop.put("langDeutsch", "1");
prop.put("langFrancais", "0");
prop.put("langEnglish", "0");
prop.put("lang_en", "1");
} else {
prop.put("langDeutsch", "0");
prop.put("langFrancais", "0");
prop.put("langEnglish", "0");
prop.put("lang_" + lang, "1");
}
return prop;
}

@ -109,12 +109,18 @@
</select>
</dd>
<dt>Exclude Hosts</dt>
<dd>List of hosts that shall be excluded from search results by default but can be included using the site:&lt;host&gt; operator:<br/>
<input type="text" name="search.excludehosts" value="#[search.excludehosts]#" size="60" /><br/>
#[search.excludehosth]#
</dd>
<dt>'About' Column<br/>(shown in a column alongside<br/>with the search result page)</dt>
<dd><input type="text" name="about.headline" value="#[about.headline]#" size="60" />(Headline)<br/>
<textarea name="about.body" cols="60" rows="8">#[about.body]#</textarea>(Content)
</dd>
<dt>&nbsp;</dt>
<dt>&nbsp;</dt>
<dd>
<input type="submit" name="searchpage_set" value="Change Search Page" />&nbsp;&nbsp;
<input type="submit" name="searchpage_default" value="Set to Default Values" />

@ -26,6 +26,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import de.anomic.data.WorkTables;
@ -93,6 +94,10 @@ public class ConfigPortal {
sb.setConfig("about.headline", post.get("about.headline", ""));
sb.setConfig("about.body", post.get("about.body", ""));
String excludehosts = post.get("search.excludehosts", "");
sb.setConfig("search.excludehosts", excludehosts);
sb.setConfig("search.excludehosth", DigestURI.hosthashes(excludehosts));
// construct navigation String
String nav = "";
if (post.getBoolean("search.navigation.hosts", false)) nav += "hosts,";
@ -126,8 +131,10 @@ public class ConfigPortal {
sb.setConfig("search.result.show.pictures", false);
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, "iffresh");
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, "true");
prop.put("about.headline", "");
prop.put("about.body", "");
sb.setConfig("about.headline", "");
sb.setConfig("about.body", "");
sb.setConfig("search.excludehosts", "");
sb.setConfig("search.excludehosth", "");
}
}
@ -167,6 +174,9 @@ public class ConfigPortal {
prop.put("about.headline", sb.getConfig("about.headline", ""));
prop.put("about.body", sb.getConfig("about.body", ""));
prop.put("search.excludehosts", sb.getConfig("search.excludehosts", ""));
prop.put("search.excludehosth", sb.getConfig("search.excludehosth", ""));
final String browserPopUpPage = sb.getConfig(SwitchboardConstants.BROWSER_POP_UP_PAGE, "ConfigBasic.html");
prop.put("popupFront", 0);
prop.put("popupSearch", 0);

@ -180,7 +180,6 @@ public class CrawlResults {
String urlstr, urltxt;
Seed initiatorSeed, executorSeed;
URIMetadataRow urle;
URIMetadataRow.Components metadata;
int cnt = 0;
final Iterator<Map.Entry<String, InitExecEntry>> i = ResultURLs.results(tabletype);
@ -193,11 +192,9 @@ public class CrawlResults {
Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash " + entry.getKey());
urlstr = null;
urltxt = null;
metadata = null;
continue;
}
metadata = urle.metadata();
urlstr = metadata.url().toNormalform(false, true);
urlstr = urle.url().toNormalform(false, true);
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
initiatorSeed = entry.getValue() == null || entry.getValue().initiatorHash == null ? null : sb.peers.getConnected(ASCII.String(entry.getValue().initiatorHash));
@ -236,11 +233,11 @@ public class CrawlResults {
prop.put("table_indexed_" + cnt + "_showTitle", (showTitle) ? "1" : "0");
prop.put("table_indexed_" + cnt + "_showTitle_available", "1");
if (metadata == null || metadata.dc_title() == null || metadata.dc_title().trim().length() == 0)
if (urle.dc_title() == null || urle.dc_title().trim().length() == 0)
prop.put("table_indexed_" + cnt + "_showTitle_available_nodescr", "0");
else {
prop.put("table_indexed_" + cnt + "_showTitle_available_nodescr", "1");
prop.putHTML("table_indexed_" + cnt + "_showTitle_available_nodescr_urldescr", metadata.dc_title());
prop.putHTML("table_indexed_" + cnt + "_showTitle_available_nodescr_urldescr", urle.dc_title());
}
prop.put("table_indexed_" + cnt + "_showTitle_available_urlHash", entry.getKey());
@ -250,13 +247,13 @@ public class CrawlResults {
if (showCountry && urle != null) {
prop.put("table_indexed_" + cnt + "_showCountry", "1");
prop.put("table_indexed_" + cnt + "_showCountry_country", metadata.url().getLocale().getCountry());
prop.put("table_indexed_" + cnt + "_showCountry_country", urle.url().getLocale().getCountry());
} else
prop.put("table_indexed_" + cnt + "_showCountry", "0");
if (showIP && urle != null) {
prop.put("table_indexed_" + cnt + "_showIP", "1");
prop.put("table_indexed_" + cnt + "_showIP_ip", metadata.url().getInetAddress().getHostAddress());
prop.put("table_indexed_" + cnt + "_showIP_ip", urle.url().getInetAddress().getHostAddress());
} else
prop.put("table_indexed_" + cnt + "_showIP", "0");

@ -377,7 +377,7 @@ public class IndexControlRWIs_p
if ( post.containsKey("keyhashsimilar") ) {
try {
final Iterator<ReferenceContainer<WordReference>> containerIt =
segment.termIndex().referenceContainer(keyhash, true, 256, false).iterator();
segment.termIndex().referenceContainer(keyhash, true, false, 256, false).iterator();
ReferenceContainer<WordReference> container;
i = 0;
int rows = 0, cols = 0;
@ -427,7 +427,7 @@ public class IndexControlRWIs_p
final URIMetadataRow e = segment.urlMetadata().load(b);
segment.urlMetadata().remove(b);
if ( e != null ) {
url = e.metadata().url();
url = e.url();
pw.println(url.getHost() + "/" + url.getFile());
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
if ( ListManager.listSetContains(
@ -463,7 +463,7 @@ public class IndexControlRWIs_p
final URIMetadataRow e = segment.urlMetadata().load(b);
segment.urlMetadata().remove(b);
if ( e != null ) {
url = e.metadata().url();
url = e.url();
pw.println(url.getHost() + "/.*");
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
if ( ListManager.listSetContains(
@ -530,10 +530,7 @@ public class IndexControlRWIs_p
String us;
long rn = -1;
while ( !ranked.isEmpty() && (entry = ranked.takeURL(false, 1000)) != null ) {
if ( (entry == null) || (entry.metadata() == null) ) {
continue;
}
url = entry.metadata().url();
url = entry.url();
if ( url == null ) {
continue;
}

@ -158,7 +158,7 @@ public class IndexControlURLs_p {
if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
urlstring = entry.metadata().url().toNormalform(false, true);
urlstring = entry.url().toNormalform(false, true);
prop.put("urlstring", "");
sb.urlRemove(segment, urlhash.getBytes());
prop.putHTML("result", "Removed URL " + urlstring);
@ -210,7 +210,7 @@ public class IndexControlURLs_p {
if (entry == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash);
} else {
prop.putHTML("urlstring", entry.metadata().url().toNormalform(false, true));
prop.putHTML("urlstring", entry.url().toNormalform(false, true));
prop.putAll(genUrlProfile(segment, entry, urlhash));
prop.put("statistics", 0);
}
@ -333,21 +333,20 @@ public class IndexControlURLs_p {
prop.put("genUrlProfile_urlhash", urlhash);
return prop;
}
final URIMetadataRow.Components metadata = entry.metadata();
final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
if (metadata == null || metadata.url() == null) {
if (entry.url() == null) {
prop.put("genUrlProfile", "1");
prop.put("genUrlProfile_urlhash", urlhash);
return prop;
}
prop.put("genUrlProfile", "2");
prop.putHTML("genUrlProfile_urlNormalform", metadata.url().toNormalform(false, true));
prop.putHTML("genUrlProfile_urlNormalform", entry.url().toNormalform(false, true));
prop.put("genUrlProfile_urlhash", urlhash);
prop.put("genUrlProfile_urlDescr", metadata.dc_title());
prop.put("genUrlProfile_urlDescr", entry.dc_title());
prop.put("genUrlProfile_moddate", entry.moddate().toString());
prop.put("genUrlProfile_loaddate", entry.loaddate().toString());
prop.put("genUrlProfile_referrer", (le == null) ? 0 : 1);
prop.putHTML("genUrlProfile_referrer_url", (le == null) ? "<unknown>" : le.metadata().url().toNormalform(false, true));
prop.putHTML("genUrlProfile_referrer_url", (le == null) ? "<unknown>" : le.url().toNormalform(false, true));
prop.put("genUrlProfile_referrer_hash", (le == null) ? "" : ASCII.String(le.hash()));
prop.put("genUrlProfile_doctype", String.valueOf(entry.doctype()));
prop.put("genUrlProfile_language", entry.language());

@ -34,8 +34,8 @@ import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.ranking.ConcurrentScoreMap;
import net.yacy.cora.ranking.ScoreMap;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.Row.Entry;

@ -33,8 +33,8 @@ import java.util.Iterator;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.ranking.ConcurrentScoreMap;
import net.yacy.cora.ranking.ScoreMap;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.Row.Entry;

@ -117,14 +117,13 @@ public class ViewFile {
// get the urlEntry that belongs to the url hash
if (urlHash.length() > 0 && (urlEntry = indexSegment.urlMetadata().load(ASCII.getBytes(urlHash))) != null) {
// get the url that belongs to the entry
final URIMetadataRow.Components metadata = urlEntry.metadata();
if ((metadata == null) || (metadata.url() == null)) {
if (urlEntry == null || urlEntry.url() == null) {
prop.put("error", "3");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
url = metadata.url();
descr = metadata.dc_title();
url = urlEntry.url();
descr = urlEntry.dc_title();
//urlEntry.wordCount();
size = urlEntry.size();
pre = urlEntry.flags().get(Condenser.flag_cat_indexof);

@ -27,7 +27,7 @@ import java.util.Iterator;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.ranking.Rating;
import net.yacy.cora.sorting.Rating;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
@ -50,7 +50,7 @@ public class termlist_p {
segment = sb.indexSegments.segment(post.get("segment"));
}
if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
final Iterator<Rating<byte[]>> i = segment.termIndex().referenceCountIterator(null, false);
final Iterator<Rating<byte[]>> i = segment.termIndex().referenceCountIterator(null, false, false);
Rating<byte[]> e;
int c = 0, termnumber = 0;
byte[] termhash, maxterm = null;

@ -87,29 +87,28 @@ public class yacydoc {
final URIMetadataRow entry = segment.urlMetadata().load(urlhash.getBytes());
if (entry == null) return prop;
final URIMetadataRow.Components metadata = entry.metadata();
if (metadata.url() == null) {
if (entry.url() == null) {
return prop;
}
final URIMetadataRow le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.urlMetadata().load(entry.referrerHash());
prop.putXML("dc_title", metadata.dc_title());
prop.putXML("dc_creator", metadata.dc_creator());
prop.putXML("dc_title", entry.dc_title());
prop.putXML("dc_creator", entry.dc_creator());
prop.putXML("dc_description", ""); // this is the fulltext part in the surrogate
prop.putXML("dc_subject", metadata.dc_subject());
prop.putXML("dc_publisher", metadata.dc_publisher());
prop.putXML("dc_subject", entry.dc_subject());
prop.putXML("dc_publisher", entry.dc_publisher());
prop.putXML("dc_contributor", "");
prop.putXML("dc_date", ISO8601Formatter.FORMATTER.format(entry.moddate()));
prop.putXML("dc_type", String.valueOf(entry.doctype()));
prop.putXML("dc_identifier", metadata.url().toNormalform(false, true));
prop.putXML("dc_identifier", entry.url().toNormalform(false, true));
prop.putXML("dc_language", ASCII.String(entry.language()));
prop.put("geo_lat", metadata.lat());
prop.put("geo_long", metadata.lon());
prop.put("geo_lat", entry.lat());
prop.put("geo_long", entry.lon());
prop.put("yacy_urlhash", metadata.url().hash());
prop.put("yacy_urlhash", entry.url().hash());
prop.putXML("yacy_loaddate", entry.loaddate().toString());
prop.putXML("yacy_referrer_hash", (le == null) ? "" : ASCII.String(le.hash()));
prop.putXML("yacy_referrer_url", (le == null) ? "" : le.metadata().url().toNormalform(false, true));
prop.putXML("yacy_referrer_url", (le == null) ? "" : le.url().toNormalform(false, true));
prop.put("yacy_size", entry.size());
prop.put("yacy_words",entry.wordCount());

@ -35,7 +35,7 @@ public class add_ymark {
if(post.containsKey("urlHash")) {
final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING);
final DigestURI url = sb.indexSegments.segment(Segments.Process.PUBLIC).urlMetadata().load(urlHash.getBytes()).metadata().url();
final DigestURI url = sb.indexSegments.segment(Segments.Process.PUBLIC).urlMetadata().load(urlHash.getBytes()).url();
final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
try {

@ -33,8 +33,8 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.Seed;
import net.yacy.peers.Protocol;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segments;
import de.anomic.crawler.ResultURLs;
@ -125,15 +125,14 @@ public final class crawlReceipt {
return prop;
}
final URIMetadataRow.Components metadata = entry.metadata();
if (metadata.url() == null) {
if (entry.url() == null) {
if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + ASCII.String(entry.hash()) + " from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "3600");
return prop;
}
// check if the entry is in our network domain
final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(metadata.url());
final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(entry.url());
if (urlRejectReason != null) {
if (log.isWarning()) log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (" + urlRejectReason + ") for hash " + ASCII.String(entry.hash()) + " from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "9999");
@ -145,7 +144,7 @@ public final class crawlReceipt {
sb.indexSegments.urlMetadata(Segments.Process.RECEIPTS).store(entry);
ResultURLs.stack(entry, youare.getBytes(), iam.getBytes(), EventOrigin.REMOTE_RECEIPTS);
sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work has been done
if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + ASCII.String(entry.hash()) + ":" + metadata.url().toNormalform(false, true));
if (log.isInfo()) log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + ASCII.String(entry.hash()) + ":" + entry.url().toNormalform(false, true));
// ready for more
prop.put("delay", "10");

@ -41,9 +41,9 @@ import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.ranking.ScoreMap;
import net.yacy.cora.ranking.WeakPriorityBlockingQueue;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
@ -244,6 +244,7 @@ public final class search {
null,
false,
sitehash,
null,
authorhash,
DigestURI.TLD_any_zone_filter,
client,
@ -305,6 +306,7 @@ public final class search {
constraint,
false,
sitehash,
null,
authorhash,
DigestURI.TLD_any_zone_filter,
client,

@ -34,10 +34,10 @@ import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.peers.Seed;
import net.yacy.peers.EventChannel;
import net.yacy.peers.Protocol;
import net.yacy.peers.Network;
import net.yacy.peers.Protocol;
import net.yacy.peers.Seed;
import net.yacy.repository.Blacklist;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segments;
@ -108,8 +108,7 @@ public final class transferURL {
}
// check if entry is well-formed
final URIMetadataRow.Components metadata = lEntry.metadata();
if (metadata == null || metadata.url() == null) {
if (lEntry.url() == null) {
Network.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + "\n\tURL Property: " + urls);
blocked++;
continue;
@ -123,28 +122,28 @@ public final class transferURL {
}
// check if the entry is blacklisted
if ((blockBlacklist) && (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, metadata.url()))) {
if (Network.log.isFine()) Network.log.logFine("transferURL: blocked blacklisted URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName);
if ((blockBlacklist) && (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_DHT, lEntry.url()))) {
if (Network.log.isFine()) Network.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url().toNormalform(false, true) + "' from peer " + otherPeerName);
lEntry = null;
blocked++;
continue;
}
// check if the entry is in our network domain
final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(metadata.url());
final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomain(lEntry.url());
if (urlRejectReason != null) {
if (Network.log.isFine()) Network.log.logFine("transferURL: blocked URL '" + metadata.url() + "' (" + urlRejectReason + ") from peer " + otherPeerName);
if (Network.log.isFine()) Network.log.logFine("transferURL: blocked URL '" + lEntry.url() + "' (" + urlRejectReason + ") from peer " + otherPeerName);
lEntry = null;
blocked++;
continue;
}
// write entry to database
if (Network.log.isFine()) Network.log.logFine("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.metadata().url().toNormalform(true, false));
if (Network.log.isFine()) Network.log.logFine("Accepting URL " + i + "/" + urlc + " from peer " + otherPeerName + ": " + lEntry.url().toNormalform(true, false));
try {
sb.indexSegments.urlMetadata(Segments.Process.DHTIN).store(lEntry);
ResultURLs.stack(lEntry, iam.getBytes(), iam.getBytes(), EventOrigin.DHT_TRANSFER);
if (Network.log.isFine()) Network.log.logFine("transferURL: received URL '" + metadata.url().toNormalform(false, true) + "' from peer " + otherPeerName);
if (Network.log.isFine()) Network.log.logFine("transferURL: received URL '" + lEntry.url().toNormalform(false, true) + "' from peer " + otherPeerName);
received++;
} catch (final IOException e) {
Log.logException(e);

@ -112,7 +112,6 @@ public class urls {
final int count = urlhashes.length() / 12;
int c = 0;
URIMetadataRow entry;
URIMetadataRow.Components metadata;
DigestURI referrer;
for (int i = 0; i < count; i++) {
entry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(ASCII.getBytes(urlhashes.substring(12 * i, 12 * (i + 1))));
@ -120,12 +119,11 @@ public class urls {
// find referrer, if there is one
referrer = sb.getURL(Segments.Process.PUBLIC, entry.referrerHash());
// create RSS entry
metadata = entry.metadata();
prop.put("item_" + c + "_title", metadata.dc_title());
prop.putXML("item_" + c + "_link", metadata.url().toNormalform(true, false));
prop.put("item_" + c + "_title", entry.dc_title());
prop.putXML("item_" + c + "_link", entry.url().toNormalform(true, false));
prop.putXML("item_" + c + "_referrer", (referrer == null) ? "" : referrer.toNormalform(true, false));
prop.putXML("item_" + c + "_description", metadata.dc_title());
prop.put("item_" + c + "_author", metadata.dc_creator());
prop.putXML("item_" + c + "_description", entry.dc_title());
prop.put("item_" + c + "_author", entry.dc_creator());
prop.put("item_" + c + "_pubDate", GenericFormatter.SHORT_SECOND_FORMATTER.format(entry.moddate()));
prop.put("item_" + c + "_guid", ASCII.String(entry.hash()));
c++;

@ -534,18 +534,17 @@ public class yacysearch {
final String recommendHash = post.get("recommendref", ""); // urlhash
final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(recommendHash));
if (urlentry != null) {
final URIMetadataRow.Components metadata = urlentry.metadata();
Document[] documents = null;
try {
documents = sb.loader.loadDocuments(sb.loader.request(metadata.url(), true, false), CacheStrategy.IFEXIST, 5000, Integer.MAX_VALUE);
documents = sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, 5000, Integer.MAX_VALUE);
} catch (final IOException e) {
} catch (final Parser.Failure e) {
}
if (documents != null) {
// create a news message
final Map<String, String> map = new HashMap<String, String>();
map.put("url", metadata.url().toNormalform(false, true).replace(',', '|'));
map.put("title", metadata.dc_title().replace(',', ' '));
map.put("url", urlentry.url().toNormalform(false, true).replace(',', '|'));
map.put("title", urlentry.dc_title().replace(',', ' '));
map.put("description", documents[0].dc_title().replace(',', ' '));
map.put("author", documents[0].dc_creator());
map.put("tags", documents[0].dc_subject(' '));
@ -564,9 +563,8 @@ public class yacysearch {
final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
final URIMetadataRow urlentry = indexSegment.urlMetadata().load(UTF8.getBytes(bookmarkHash));
if (urlentry != null) {
final URIMetadataRow.Components metadata = urlentry.metadata();
try {
sb.tables.bookmarks.createBookmark(sb.loader, metadata.url(), YMarkTables.USER_ADMIN, true, "searchresult", "/search");
sb.tables.bookmarks.createBookmark(sb.loader, urlentry.url(), YMarkTables.USER_ADMIN, true, "searchresult", "/search");
} catch (final Throwable e) {
}
}
@ -618,6 +616,7 @@ public class yacysearch {
constraint,
true,
sitehash,
DigestURI.hosthashess(sb.getConfig("search.excludehosth", "")),
authorhash,
DigestURI.TLD_any_zone_filter,
client,

@ -27,7 +27,7 @@
import java.util.Iterator;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.ranking.ScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.document.LibraryProvider;
import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.Formatter;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -41,6 +41,7 @@ import java.util.concurrent.ConcurrentMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.index.BufferedObjectIndex;
@ -49,7 +50,6 @@ import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.table.Table;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.MemoryControl;

@ -34,8 +34,8 @@ import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.ranking.ClusteredScoreMap;
import net.yacy.cora.ranking.ScoreMap;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
@ -115,7 +115,7 @@ public final class ResultURLs {
try {
final ScoreMap<String> domains = getDomains(stackType);
if (domains != null) {
domains.inc(e.metadata().url().getHost());
domains.inc(e.url().getHost());
}
} catch (final Exception ex) {
System.out.println("INTERNAL ERROR in newEntry/3: " + ex.toString());

@ -7,8 +7,8 @@ import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.ranking.ClusteredScoreMap;
import net.yacy.cora.ranking.ReversibleScoreMap;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.document.LibraryProvider;
import net.yacy.document.StringBuilderComparator;
import net.yacy.kelondro.data.word.Word;

@ -37,12 +37,12 @@ import java.util.Map;
import java.util.Random;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.FileUtils;

@ -81,7 +81,7 @@ public class YMarkMetadata {
public YMarkMetadata(final byte[] urlHash, final Segments indexSegment) {
this.document = null;
this.indexSegment = indexSegment;
this.uri = this.indexSegment.segment(Segments.Process.PUBLIC).urlMetadata().load(urlHash).metadata().url();
this.uri = this.indexSegment.segment(Segments.Process.PUBLIC).urlMetadata().load(urlHash).url();
}
public YMarkMetadata(final Document document) {
@ -115,14 +115,10 @@ public class YMarkMetadata {
metadata.put(METADATA.WORDCOUNT, String.valueOf(urlEntry.wordCount()));
metadata.put(METADATA.MIMETYPE, String.valueOf(urlEntry.doctype()));
metadata.put(METADATA.LANGUAGE, UTF8.String(urlEntry.language()));
final URIMetadataRow.Components meta = urlEntry.metadata();
if (meta != null) {
metadata.put(METADATA.TITLE, meta.dc_title());
metadata.put(METADATA.CREATOR, meta.dc_creator());
metadata.put(METADATA.KEYWORDS, meta.dc_subject());
metadata.put(METADATA.PUBLISHER, meta.dc_publisher());
}
metadata.put(METADATA.TITLE, urlEntry.dc_title());
metadata.put(METADATA.CREATOR, urlEntry.dc_creator());
metadata.put(METADATA.KEYWORDS, urlEntry.dc_subject());
metadata.put(METADATA.PUBLISHER, urlEntry.dc_publisher());
}
return metadata;
}

@ -0,0 +1,429 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.yacy.cora.language.phonetic;
import java.util.Locale;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
/**
* <p>
* Encodes a string into a Cologne Phonetic value.
* </p>
* <p>
* Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">K&ouml;lner Phonetik</a> (Cologne Phonetic)
* algorithm issued by Hans Joachim Postel in 1969.
* </p>
*
* <p>
* The <i>K&ouml;lner Phonetik</i> is a phonetic algorithm which is optimized for the German language. It is related to the
* well-known soundex algorithm.
* </p>
*
* <h2>Algorithm</h2>
*
* <ul>
*
* <li>
* <h3>Step 1:</h3>
* After preprocessing (conversion to upper case, transcription of <a
* href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the
* letters of the supplied text are replaced by their phonetic code according to the following table.
* <table border="1">
* <tbody>
* <tr>
* <th>Letter</th>
* <th>Context</th>
* <th align="center">Code</th>
* </tr>
* <tr>
* <td>A, E, I, J, O, U, Y</td>
* <td></td>
* <td align="center">0</td>
* </tr>
* <tr>
*
* <td>H</td>
* <td></td>
* <td align="center">-</td>
* </tr>
* <tr>
* <td>B</td>
* <td></td>
* <td rowspan="2" align="center">1</td>
* </tr>
* <tr>
* <td>P</td>
* <td>not before H</td>
*
* </tr>
* <tr>
* <td>D, T</td>
* <td>not before C, S, Z</td>
* <td align="center">2</td>
* </tr>
* <tr>
* <td>F, V, W</td>
* <td></td>
* <td rowspan="2" align="center">3</td>
* </tr>
* <tr>
*
* <td>P</td>
* <td>before H</td>
* </tr>
* <tr>
* <td>G, K, Q</td>
* <td></td>
* <td rowspan="3" align="center">4</td>
* </tr>
* <tr>
* <td rowspan="2">C</td>
* <td>at onset before A, H, K, L, O, Q, R, U, X</td>
*
* </tr>
* <tr>
* <td>before A, H, K, O, Q, U, X except after S, Z</td>
* </tr>
* <tr>
* <td>X</td>
* <td>not after C, K, Q</td>
* <td align="center">48</td>
* </tr>
* <tr>
* <td>L</td>
* <td></td>
*
* <td align="center">5</td>
* </tr>
* <tr>
* <td>M, N</td>
* <td></td>
* <td align="center">6</td>
* </tr>
* <tr>
* <td>R</td>
* <td></td>
* <td align="center">7</td>
* </tr>
*
* <tr>
* <td>S, Z</td>
* <td></td>
* <td rowspan="6" align="center">8</td>
* </tr>
* <tr>
* <td rowspan="3">C</td>
* <td>after S, Z</td>
* </tr>
* <tr>
* <td>at onset except before A, H, K, L, O, Q, R, U, X</td>
* </tr>
*
* <tr>
* <td>not before A, H, K, O, Q, U, X</td>
* </tr>
* <tr>
* <td>D, T</td>
* <td>before C, S, Z</td>
* </tr>
* <tr>
* <td>X</td>
* <td>after C, K, Q</td>
* </tr>
* </tbody>
* </table>
* <p>
* <small><i>(Source: <a href= "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes" >Wikipedia (de):
* K&ouml;lner Phonetik -- Buchstabencodes</a>)</i></small>
* </p>
*
* <h4>Example:</h4>
*
* {@code "M}&uuml;{@code ller-L}&uuml;{@code denscheidt" => "MULLERLUDENSCHEIDT" => "6005507500206880022"}
*
* </li>
*
* <li>
* <h3>Step 2:</h3>
* Collapse of all multiple consecutive code digits.
* <h4>Example:</h4>
* {@code "6005507500206880022" => "6050750206802"}</li>
*
* <li>
* <h3>Step 3:</h3>
* Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur
* if they occur after removing the "0" digits.
*
* <h4>Example:</h4>
* {@code "6050750206802" => "65752682"}</li>
*
* </ul>
*
* @see <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): K&ouml;lner Phonetik (in German)</a>
* @author Apache Software Foundation
* @since 1.5
*/
public class ColognePhonetic implements StringEncoder {

    /** Letters that are encoded as '0'. */
    private static final char[] AEIJOUY = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'};

    /** Followers that keep D and T from being encoded as '2'. */
    private static final char[] SCZ = {'S', 'C', 'Z'};

    /** Letters that are encoded as '3' (P only reaches this set when followed by H). */
    private static final char[] WFPV = {'W', 'F', 'P', 'V'};

    /** Letters that are encoded as '4'. */
    private static final char[] GKQ = {'G', 'K', 'Q'};

    /** Predecessors after which X is encoded as '8' instead of "48". */
    private static final char[] CKQ = {'C', 'K', 'Q'};

    /** Followers that make a C at the onset encode as '4'. */
    private static final char[] AHKLOQRUX = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'};

    /** Predecessors after which a C is always encoded as '8'. */
    private static final char[] SZ = {'S', 'Z'};

    /** Followers that make a C inside the word encode as '4'. */
    private static final char[] AHOUKQX = {'A', 'H', 'O', 'U', 'K', 'Q', 'X'};

    /** Letters that fall back to '8' when none of the earlier rules matched. */
    private static final char[] TDX = {'T', 'D', 'X'};

    /**
     * Maps some Germanic characters to plain for internal processing. The following characters are mapped:
     * <ul>
     * <li>capital a, umlaut mark</li>
     * <li>capital u, umlaut mark</li>
     * <li>capital o, umlaut mark</li>
     * <li>small sharp s, German</li>
     * </ul>
     */
    private static final char[][] PREPROCESS_MAP = new char[][]{
        {'\u00C4', 'A'}, // capital a, umlaut mark
        {'\u00DC', 'U'}, // capital u, umlaut mark
        {'\u00D6', 'O'}, // capital o, umlaut mark
        {'\u00DF', 'S'}  // small sharp s, German
    };

    /**
     * Common base of the input and output buffers: a char array plus the number of
     * characters that are currently in use.
     */
    private abstract static class CologneBuffer {

        protected final char[] data;

        protected int length = 0;

        /** Wraps an existing array; all of its characters count as used. */
        public CologneBuffer(char[] data) {
            this.data = data;
            this.length = data.length;
        }

        /** Allocates an empty buffer with the given capacity. */
        public CologneBuffer(int buffSize) {
            this.data = new char[buffSize];
            this.length = 0;
        }

        /** Copies {@code length} used characters, starting at logical position {@code start}. */
        protected abstract char[] copyData(int start, final int length);

        public int length() {
            return length;
        }

        @Override
        public String toString() {
            return new String(copyData(0, length));
        }
    }

    /** Output buffer: used characters occupy the front of the array and grow to the right. */
    private static class CologneOutputBuffer extends CologneBuffer {

        public CologneOutputBuffer(int buffSize) {
            super(buffSize);
        }

        /** Appends one character behind the characters written so far. */
        public void addRight(char chr) {
            data[length] = chr;
            length++;
        }

        @Override
        protected char[] copyData(int start, final int length) {
            char[] newData = new char[length];
            System.arraycopy(data, start, newData, 0, length);
            return newData;
        }
    }

    /** Input buffer: the unread characters occupy the tail of the array. */
    private static class CologneInputBuffer extends CologneBuffer {

        public CologneInputBuffer(char[] data) {
            super(data);
        }

        /** Pushes one character back in front of the unread characters. */
        public void addLeft(char ch) {
            length++;
            data[getNextPos()] = ch;
        }

        @Override
        protected char[] copyData(int start, final int length) {
            char[] newData = new char[length];
            System.arraycopy(data, data.length - this.length + start, newData, 0, length);
            return newData;
        }

        /** Returns the next unread character without consuming it. */
        public char getNextChar() {
            return data[getNextPos()];
        }

        /** Array index of the next unread character. */
        protected int getNextPos() {
            return data.length - length;
        }

        /** Consumes and returns the next unread character. */
        public char removeNext() {
            char ch = getNextChar();
            length--;
            return ch;
        }
    }

    /*
     * Returns whether the array contains the key, or not.
     */
    private static boolean arrayContains(char[] arr, char key) {
        for (char element : arr) {
            if (element == key) {
                return true;
            }
        }
        return false;
    }

    /**
     * <p>
     * Implements the <i>K&ouml;lner Phonetik</i> algorithm.
     * </p>
     * <p>
     * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass.
     * </p>
     *
     * @param text the text to encode; may be {@code null}
     * @return the corresponding encoding according to the <i>K&ouml;lner Phonetik</i> algorithm,
     *         or {@code null} if {@code text} is {@code null}
     */
    public String colognePhonetic(String text) {
        if (text == null) {
            return null;
        }

        text = preprocess(text);

        // An X may expand to two digits ("48"), so reserve twice the input length.
        CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2);
        CologneInputBuffer input = new CologneInputBuffer(text.toCharArray());

        char nextChar;

        char lastChar = '-';
        char lastCode = '/'; // '/' marks "nothing encoded yet", i.e. we are still at the onset
        char code;
        char chr;

        int rightLength = input.length();

        while (rightLength > 0) {
            chr = input.removeNext();

            rightLength = input.length();
            nextChar = rightLength > 0 ? input.getNextChar() : '-';

            if (arrayContains(AEIJOUY, chr)) {
                code = '0';
            } else if (chr == 'H' || chr < 'A' || chr > 'Z') {
                if (lastCode == '/') {
                    // ignored characters in front of the first encoded letter do not end the onset
                    continue;
                }
                code = '-';
            } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) {
                code = '1';
            } else if ((chr == 'D' || chr == 'T') && !arrayContains(SCZ, nextChar)) {
                code = '2';
            } else if (arrayContains(WFPV, chr)) {
                code = '3';
            } else if (arrayContains(GKQ, chr)) {
                code = '4';
            } else if (chr == 'X' && !arrayContains(CKQ, lastChar)) {
                // X becomes "48": emit '4' now and push an 'S' back so the next round emits '8'
                code = '4';
                input.addLeft('S');
                rightLength++;
            } else if (chr == 'S' || chr == 'Z') {
                code = '8';
            } else if (chr == 'C') {
                if (lastCode == '/') {
                    if (arrayContains(AHKLOQRUX, nextChar)) {
                        code = '4';
                    } else {
                        code = '8';
                    }
                } else {
                    if (arrayContains(SZ, lastChar) || !arrayContains(AHOUKQX, nextChar)) {
                        code = '8';
                    } else {
                        code = '4';
                    }
                }
            } else if (arrayContains(TDX, chr)) {
                code = '8';
            } else if (chr == 'R') {
                code = '7';
            } else if (chr == 'L') {
                code = '5';
            } else if (chr == 'M' || chr == 'N') {
                code = '6';
            } else {
                code = chr;
            }

            // Skip ignored characters, repeated digits, and every '0' that is not the very first code.
            if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) {
                output.addRight(code);
            }

            lastChar = chr;
            lastCode = code;
        }
        return output.toString();
    }

    /**
     * Encodes an object with the Cologne Phonetic algorithm.
     *
     * @param object the object to encode; must be a {@link String}
     * @return the encoded string
     * @throws EncoderException if {@code object} is {@code null} or not a {@link String}
     */
    public Object encode(Object object) throws EncoderException {
        if (!(object instanceof String)) {
            // guard against null so that the documented EncoderException is thrown, not a NullPointerException
            final String actualType = (object == null) ? "null" : object.getClass().getName();
            throw new EncoderException("This method's parameter was expected to be of the type " +
                String.class.getName() +
                ". But actually it was of the type " +
                actualType +
                ".");
        }
        return encode((String) object);
    }

    /**
     * Encodes a string with the Cologne Phonetic algorithm.
     *
     * @param text the text to encode; may be {@code null}
     * @return the encoded text, or {@code null} if {@code text} is {@code null}
     */
    public String encode(String text) {
        return colognePhonetic(text);
    }

    /**
     * Tests whether two texts have the same Cologne Phonetic encoding.
     *
     * @param text1 first text; may be {@code null}
     * @param text2 second text; may be {@code null}
     * @return {@code true} if both encodings are equal; two {@code null} texts count as equal,
     *         a {@code null} text never equals a non-{@code null} one
     */
    public boolean isEncodeEqual(String text1, String text2) {
        final String code1 = colognePhonetic(text1);
        final String code2 = colognePhonetic(text2);
        if (code1 == null) {
            return code2 == null;
        }
        return code1.equals(code2);
    }

    /**
     * Converts the string to upper case and replaces germanic characters as defined in {@link #PREPROCESS_MAP}.
     */
    private static String preprocess(String text) {
        text = text.toUpperCase(Locale.GERMAN);

        char[] chrs = text.toCharArray();

        for (int index = 0; index < chrs.length; index++) {
            if (chrs[index] > 'Z') {
                for (char[] element : PREPROCESS_MAP) {
                    if (chrs[index] == element[0]) {
                        chrs[index] = element[1];
                        break;
                    }
                }
            }
        }
        return new String(chrs);
    }
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,407 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.yacy.cora.language.phonetic;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
/**
* Encodes a string into a Metaphone value.
* <p>
* Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
* Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
* </p>
* <p>
* <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
* 39.</CITE>
* </p>
* <p>
* Note, that this does not match the algorithm that ships with PHP, or the algorithm
* found in the Perl <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>.
* They have had undocumented changes from the originally published algorithm.
* For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
* </p>
*
* @author Apache Software Foundation
* @version $Id: Metaphone.java 1157192 2011-08-12 17:27:38Z ggregory $
*/
public class Metaphone implements StringEncoder {

    /**
     * The five letters treated as vowels.
     */
    private static final String VOWELS = "AEIOU";

    /**
     * Letters that soften a preceding C, G or DG.
     */
    private static final String FRONTV = "EIY";

    /**
     * Letters after which an H is not encoded.
     */
    private static final String VARSON = "CSPTG";

    /**
     * Maximum length of the generated code; 4 by default.
     */
    private int maxCodeLen = 4;

    /**
     * Creates an instance of the Metaphone encoder
     */
    public Metaphone() {
        super();
    }

    /**
     * Find the metaphone value of a String. This is similar to the
     * soundex algorithm, but better at finding similar sounding words.
     * All input is converted to upper case.
     * Limitations: Input format is expected to be a single ASCII word
     * with only characters in the A - Z range, no punctuation or numbers.
     *
     * @param txt String to find the metaphone code for
     * @return A metaphone code corresponding to the String supplied; the empty
     *         string for {@code null} or empty input, and the upper-cased input
     *         itself for a single character
     */
    public String metaphone(String txt) {
        if ((txt == null) || (txt.length() == 0)) {
            return "";
        }
        // single character is itself
        if (txt.length() == 1) {
            return txt.toUpperCase(java.util.Locale.ENGLISH);
        }

        final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();

        final StringBuilder local = fixInitialLetters(inwd); // working copy with initials fixed
        final StringBuilder code = new StringBuilder(10);     // output

        final int wdsz = local.length();
        int n = 0;

        while ((code.length() < this.getMaxCodeLen()) && (n < wdsz)) { // max code size of 4 works well
            final char symb = local.charAt(n);
            // remove duplicate letters except C
            if ((symb != 'C') && isPreviousChar(local, n, symb)) {
                n++;
            } else { // not dup
                switch (symb) {
                case 'A': case 'E': case 'I': case 'O': case 'U':
                    // only use vowel if leading char
                    if (n == 0) {
                        code.append(symb);
                    }
                    break;
                case 'B':
                    // B is silent if word ends in MB
                    if (!(isPreviousChar(local, n, 'M') && isLastChar(wdsz, n))) {
                        code.append(symb);
                    }
                    break;
                case 'C': { // lots of C special cases
                    final boolean afterS = isPreviousChar(local, n, 'S');
                    final boolean beforeFrontVowel =
                        !isLastChar(wdsz, n) && (FRONTV.indexOf(local.charAt(n + 1)) >= 0);
                    if (afterS && beforeFrontVowel) {
                        break; // discard if SCI, SCE or SCY
                    }
                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
                        code.append('X');
                        break;
                    }
                    if (beforeFrontVowel) { // CI,CE,CY -> S
                        code.append('S');
                        break;
                    }
                    if (!isNextChar(local, n, 'H')) {
                        code.append('K');
                        break;
                    }
                    if (afterS) { // SCH->sk
                        code.append('K');
                        break;
                    }
                    // CH: K only for a leading CH whose third letter is a vowel, X otherwise
                    if ((n == 0) && (wdsz >= 3) && isVowel(local, 2)) {
                        code.append('K');
                    } else {
                        code.append('X');
                    }
                    break;
                }
                case 'D':
                    if (!isLastChar(wdsz, n + 1) &&
                        isNextChar(local, n, 'G') &&
                        (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
                        code.append('J');
                        n += 2; // also consume the G and the front vowel
                    } else {
                        code.append('T');
                    }
                    break;
                case 'G': {
                    // GH silent at end or before consonant
                    if (isNextChar(local, n, 'H') &&
                        (isLastChar(wdsz, n + 1) || !isVowel(local, n + 2))) {
                        break;
                    }
                    // silent G in a non-initial GN (this also covers GNED)
                    if ((n > 0) && regionMatch(local, n, "GN")) {
                        break;
                    }
                    // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
                    final boolean hard = isPreviousChar(local, n, 'G');
                    if (!isLastChar(wdsz, n) &&
                        (FRONTV.indexOf(local.charAt(n + 1)) >= 0) &&
                        !hard) {
                        code.append('J');
                    } else {
                        code.append('K');
                    }
                    break;
                }
                case 'H':
                    if (isLastChar(wdsz, n)) {
                        break; // terminal H
                    }
                    if ((n > 0) && (VARSON.indexOf(local.charAt(n - 1)) >= 0)) {
                        break; // H after C, S, P, T or G
                    }
                    if (isVowel(local, n + 1)) {
                        code.append('H'); // Hvowel
                    }
                    break;
                case 'F':
                case 'J':
                case 'L':
                case 'M':
                case 'N':
                case 'R':
                    code.append(symb);
                    break;
                case 'K':
                    // silent only after C; isPreviousChar is false at position 0, so an initial K is kept
                    if (!isPreviousChar(local, n, 'C')) {
                        code.append(symb);
                    }
                    break;
                case 'P':
                    // PH -> F
                    code.append(isNextChar(local, n, 'H') ? 'F' : symb);
                    break;
                case 'Q':
                    code.append('K');
                    break;
                case 'S':
                    if (regionMatch(local, n, "SH") ||
                        regionMatch(local, n, "SIO") ||
                        regionMatch(local, n, "SIA")) {
                        code.append('X');
                    } else {
                        code.append('S');
                    }
                    break;
                case 'T':
                    if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
                        code.append('X');
                        break;
                    }
                    if (regionMatch(local, n, "TCH")) {
                        // Silent if in "TCH"
                        break;
                    }
                    // substitute numeral 0 for TH (resembles theta after all)
                    code.append(regionMatch(local, n, "TH") ? '0' : 'T');
                    break;
                case 'V':
                    code.append('F');
                    break;
                case 'W': case 'Y': // silent if not followed by vowel
                    if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
                        code.append(symb);
                    }
                    break;
                case 'X':
                    code.append('K');
                    code.append('S');
                    break;
                case 'Z':
                    code.append('S');
                    break;
                } // end switch; any other character produces no code
                n++;
            } // end else from symb != 'C'
            // X appends two letters at once, so the code may overshoot by one
            if (code.length() > this.getMaxCodeLen()) {
                code.setLength(this.getMaxCodeLen());
            }
        }
        return code.toString();
    }

    /**
     * Builds the working string from the upper-cased input and applies the
     * exceptions for the first two letters: KN, GN, PN, AE and WR lose their
     * first letter, WH becomes W and an initial X becomes S.
     *
     * @param inwd upper-cased input with at least two characters; an initial X is overwritten in place
     * @return the working string
     */
    private static StringBuilder fixInitialLetters(final char[] inwd) {
        final StringBuilder local = new StringBuilder(40);
        switch (inwd[0]) {
        case 'K':
        case 'G':
        case 'P': /* looking for KN, etc*/
            if (inwd[1] == 'N') {
                local.append(inwd, 1, inwd.length - 1);
            } else {
                local.append(inwd);
            }
            break;
        case 'A': /* looking for AE */
            if (inwd[1] == 'E') {
                local.append(inwd, 1, inwd.length - 1);
            } else {
                local.append(inwd);
            }
            break;
        case 'W': /* looking for WR or WH */
            if (inwd[1] == 'R') { // WR -> R
                local.append(inwd, 1, inwd.length - 1);
            } else if (inwd[1] == 'H') {
                local.append(inwd, 1, inwd.length - 1);
                local.setCharAt(0, 'W'); // WH -> W
            } else {
                local.append(inwd);
            }
            break;
        case 'X': /* initial X becomes S */
            inwd[0] = 'S';
            local.append(inwd);
            break;
        default:
            local.append(inwd);
        }
        return local;
    }

    /** Tells whether the character at {@code index} is one of A, E, I, O, U. */
    private static boolean isVowel(StringBuilder string, int index) {
        return VOWELS.indexOf(string.charAt(index)) >= 0;
    }

    /** Tells whether a character exists before {@code index} and equals {@code c}. */
    private static boolean isPreviousChar(StringBuilder string, int index, char c) {
        return index > 0
            && index < string.length()
            && string.charAt(index - 1) == c;
    }

    /** Tells whether a character exists after {@code index} and equals {@code c}. */
    private static boolean isNextChar(StringBuilder string, int index, char c) {
        return index >= 0
            && index < string.length() - 1
            && string.charAt(index + 1) == c;
    }

    /** Tells whether {@code test} occurs in {@code string} starting exactly at {@code index}. */
    private static boolean regionMatch(StringBuilder string, int index, String test) {
        if (index < 0 || (index + test.length() - 1) >= string.length()) {
            return false;
        }
        return string.substring(index, index + test.length()).equals(test);
    }

    /** Tells whether position {@code n} is the last one in a word of {@code wdsz} characters. */
    private static boolean isLastChar(int wdsz, int n) {
        return n + 1 == wdsz;
    }

    /**
     * Encodes an Object using the metaphone algorithm. This method
     * is provided in order to satisfy the requirements of the
     * Encoder interface, and will throw an EncoderException if the
     * supplied object is not of type java.lang.String.
     *
     * @param pObject Object to encode
     * @return An object (or type java.lang.String) containing the
     *         metaphone code which corresponds to the String supplied.
     * @throws EncoderException if the parameter supplied is not
     *         of type java.lang.String
     */
    public Object encode(Object pObject) throws EncoderException {
        if (!(pObject instanceof String)) {
            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
        }
        return metaphone((String) pObject);
    }

    /**
     * Encodes a String using the Metaphone algorithm.
     *
     * @param pString String object to encode
     * @return The metaphone code corresponding to the String supplied
     */
    public String encode(String pString) {
        return metaphone(pString);
    }

    /**
     * Tests if the metaphones of two strings are identical.
     *
     * @param str1 First of two strings to compare
     * @param str2 Second of two strings to compare
     * @return <code>true</code> if the metaphones of these strings are identical,
     *         <code>false</code> otherwise.
     */
    public boolean isMetaphoneEqual(String str1, String str2) {
        return metaphone(str1).equals(metaphone(str2));
    }

    /**
     * Returns the maxCodeLen.
     * @return int
     */
    public int getMaxCodeLen() { return this.maxCodeLen; }

    /**
     * Sets the maxCodeLen.
     * @param maxCodeLen The maxCodeLen to set
     */
    public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
}

@ -0,0 +1,73 @@
/**
* Phonetic
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
* First released 13.12.2011 at http://yacy.net
*
* $LastChangedDate$
* $LastChangedRevision$
* $LastChangedBy$
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.language.phonetic;
/**
 * Static front end for the phonetic encoders in this package: selects an encoder
 * by enum constant and applies it to a string.
 */
public class Phonetic {

    /**
     * The available phonetic encodings. {@code NONE} stands for "no encoding".
     */
    public enum Encoder {
        SOUNDEX("Soundex", ""),
        COLONE_PHONETIC("Koelner Phonetik", "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik"),
        METAPHONE("Metaphone", ""),
        DOUBLE_METAPHONE("Double Metaphone", ""),
        NONE("", "");

        /** Name of the encoding for display purposes. */
        final String printName;

        /** Link to a description of the encoding; empty if there is none. */
        final String infoUrl;

        private Encoder(final String printName, final String infoUrl) {
            this.printName = printName;
            this.infoUrl = infoUrl;
        }
    }

    private static final Soundex soundexEncoder = new Soundex();
    private static final Metaphone metaphoneEncoder = new Metaphone();
    private static final DoubleMetaphone doubleMetaphoneEncoder = new DoubleMetaphone();
    private static final ColognePhonetic colognePhonetic = new ColognePhonetic();

    /**
     * Encodes a string with the selected phonetic encoder.
     *
     * @param encoder the encoder to apply; {@code NONE} (or {@code null}) leaves the string unchanged
     * @param s the string to encode
     * @return the phonetic code of {@code s}, or {@code s} itself if no encoder was
     *         selected or the encoder failed with an exception
     */
    public static String encode(final Encoder encoder, final String s) {
        try {
            if (encoder == Encoder.SOUNDEX) return soundexEncoder.encode(s);
            if (encoder == Encoder.COLONE_PHONETIC) return colognePhonetic.encode(s);
            if (encoder == Encoder.METAPHONE) return metaphoneEncoder.encode(s);
            if (encoder == Encoder.DOUBLE_METAPHONE) return doubleMetaphoneEncoder.encode(s);
            return s;
        } catch (Exception e) {
            // some encoders do not work with non-ASCII characters and throw an exception;
            // fall back to the unencoded string. Only exceptions are caught here so that
            // JVM errors (e.g. OutOfMemoryError) are not silently swallowed.
            return s;
        }
    }

    /**
     * Prints, one line per encoder, the encodings of all command line arguments.
     *
     * @param args the words to encode
     */
    public static void main(String[] args) {
        for (Encoder encoder: Encoder.values()) {
            for (String s: args) {
                System.out.print(Phonetic.encode(encoder, s));
                System.out.print(" ");
            }
            System.out.println();
        }
    }
}

@ -0,0 +1,340 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.yacy.cora.language.phonetic;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
/**
* Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
* general purpose scheme to find word with similar phonemes.
*
* @author Apache Software Foundation
* @version $Id: Soundex.java 1201529 2011-11-13 21:57:16Z ggregory $
*/
public class Soundex implements StringEncoder {

    /**
     * Default mapping for the 26 letters of US English, one digit per letter from
     * A to Z. A digit of <code>0</code> means that the letter is not encoded.
     * <p>
     * (Kept as a string constant so that Javadoc can show the value on the
     * constant values page.)
     * </p>
     *
     * @see #US_ENGLISH_MAPPING
     */
    public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";

    /**
     * Character array form of {@link #US_ENGLISH_MAPPING_STRING}.
     *
     * @see Soundex#Soundex(char[])
     */
    private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();

    /**
     * A shared encoder that uses the US English mapping.
     *
     * @see #US_ENGLISH_MAPPING
     */
    public static final Soundex US_ENGLISH = new Soundex();

    /**
     * The code digit for each letter; the entry at position 0 belongs to 'A',
     * the next one to 'B', and so on.
     */
    private final char[] soundexMapping;

    /**
     * Creates an encoder that uses the US English mapping.
     *
     * @see Soundex#Soundex(char[])
     * @see Soundex#US_ENGLISH_MAPPING
     */
    public Soundex() {
        this.soundexMapping = US_ENGLISH_MAPPING;
    }

    /**
     * Creates an encoder with a custom mapping, for instance for another alphabet.
     * The array is copied, so later changes to the argument do not affect the encoder.
     *
     * @param mapping
     *            code digits, one per letter, starting with the digit for 'A'
     */
    public Soundex(char[] mapping) {
        this.soundexMapping = mapping.clone();
    }

    /**
     * Creates an encoder with a custom mapping given as a string.
     *
     * @param mapping
     *            code digits, one per letter, starting with the digit for 'A'
     * @since 1.4
     */
    public Soundex(String mapping) {
        this.soundexMapping = mapping.toCharArray();
    }

    /**
     * Encodes both strings with this encoder and counts the positions at which the
     * two codes carry the same character. For Soundex codes the result lies between
     * 0 (little or no similarity) and 4 (strong similarity or identical values).
     *
     * @param s1
     *            first string to encode and compare
     * @param s2
     *            second string to encode and compare
     * @return number of positions with equal characters, from 0 to 4
     *
     * @see #difference(StringEncoder,String,String)
     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
     *      T-SQL DIFFERENCE </a>
     *
     * @throws EncoderException
     *             if an error occurs encoding one of the strings
     * @since 1.3
     */
    public int difference(String s1, String s2) throws EncoderException {
        return difference(this, s1, s2);
    }

    /**
     * Encodes an object with the soundex algorithm. Exists to satisfy the Encoder
     * interface; only strings are accepted.
     *
     * @param pObject
     *            the object to encode, expected to be a String
     * @return the soundex code of the supplied string
     * @throws EncoderException
     *             if the argument is not a java.lang.String
     * @throws IllegalArgumentException
     *             if a character is not mapped
     */
    public Object encode(Object pObject) throws EncoderException {
        if (pObject instanceof String) {
            return soundex((String) pObject);
        }
        throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
    }

    /**
     * Encodes a string with the soundex algorithm.
     *
     * @param pString
     *            the string to encode
     * @return the soundex code of the supplied string
     * @throws IllegalArgumentException
     *             if a character is not mapped
     */
    public String encode(String pString) {
        return soundex(pString);
    }

    /**
     * Determines the code digit for one position of the cleaned string and applies
     * the H/W rule: consonants of the same code group that are separated only by
     * an H or a W are treated as one.
     *
     * @param str
     *            the cleaned, upper-case working string
     * @param index
     *            position of the character to encode
     * @return the code digit, or the character with value 0 if the H/W rule suppresses it
     * @throws IllegalArgumentException
     *             if a character is not mapped
     */
    private char getMappingCode(String str, int index) {
        final char digit = map(str.charAt(index));
        // the rule needs two predecessors and only concerns encoded letters
        if (index <= 1 || digit == '0') {
            return digit;
        }
        final char separator = str.charAt(index - 1);
        if (separator != 'H' && separator != 'W') {
            return digit;
        }
        final char beforeSeparator = str.charAt(index - 2);
        final char beforeDigit = map(beforeSeparator);
        if (beforeDigit == digit || beforeSeparator == 'H' || beforeSeparator == 'W') {
            return (char) 0;
        }
        return digit;
    }

    /**
     * Looks up the code digit of an upper-case letter in the mapping of this encoder.
     *
     * @param ch
     *            an upper-case character
     * @return the code digit for the character
     * @throws IllegalArgumentException
     *             if <code>ch</code> lies outside the mapping
     */
    private char map(char ch) {
        final int slot = ch - 'A';
        if (slot >= 0 && slot < this.soundexMapping.length) {
            return this.soundexMapping[slot];
        }
        throw new IllegalArgumentException("The character is not mapped: " + ch);
    }

    /**
     * Computes the soundex code of a string: the first letter followed by up to
     * three code digits, padded with '0' to four characters.
     *
     * @param str
     *            the string to encode
     * @return the soundex code; <code>null</code> for <code>null</code> input and
     *         the empty string if the input contains no letters
     * @throws IllegalArgumentException
     *             if a character is not mapped
     */
    public String soundex(String str) {
        if (str == null) {
            return null;
        }
        final String word = clean(str);
        if (word.length() == 0) {
            return word;
        }
        final char[] result = {'0', '0', '0', '0'};
        result[0] = word.charAt(0);
        // getMappingCode() throws IllegalArgumentException
        char previous = getMappingCode(word, 0);
        int filled = 1;
        for (int pos = 1; pos < word.length() && filled < result.length; pos++) {
            final char digit = getMappingCode(word, pos);
            if (digit == 0) {
                continue; // suppressed by the H/W rule; does not change the previous digit
            }
            if (digit != '0' && digit != previous) {
                result[filled++] = digit;
            }
            previous = digit;
        }
        return new String(result);
    }

    /**
     * Prepares a string for encoding: drops everything that is not a letter and
     * converts the rest to upper case.
     *
     * @param str
     *            the string to clean
     * @return the cleaned string; <code>null</code> and empty input are returned unchanged
     */
    static String clean(String str) {
        if (str == null || str.length() == 0) {
            return str;
        }
        final int total = str.length();
        final char[] letters = new char[total];
        int kept = 0;
        for (int i = 0; i < total; i++) {
            final char c = str.charAt(i);
            if (Character.isLetter(c)) {
                letters[kept++] = c;
            }
        }
        // nothing was dropped: convert the original string directly
        final String lettersOnly = (kept == total) ? str : new String(letters, 0, kept);
        return lettersOnly.toUpperCase(java.util.Locale.ENGLISH);
    }

    /**
     * Encodes both strings with the given encoder and counts the positions at
     * which the two codes carry the same character.
     * <ul>
     * <li>For Soundex the result lies between 0 (little or no similarity) and 4
     * (strong similarity or identical values).</li>
     * <li>For refined Soundex the result can be greater than 4.</li>
     * </ul>
     *
     * @param encoder
     *            the encoder used for both strings
     * @param s1
     *            first string to encode and compare
     * @param s2
     *            second string to encode and compare
     * @return number of positions with equal characters in the two codes
     *
     * @see #differenceEncoded(String,String)
     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
     *      MS T-SQL DIFFERENCE</a>
     *
     * @throws EncoderException
     *             if an error occurs encoding one of the strings
     */
    static int difference(StringEncoder encoder, String s1, String s2) throws EncoderException {
        final String code1 = encoder.encode(s1);
        final String code2 = encoder.encode(s2);
        return differenceEncoded(code1, code2);
    }

    /**
     * Counts the positions at which two already encoded strings carry the same
     * character. Only the common prefix length of the two codes is compared.
     * <ul>
     * <li>For Soundex the result lies between 0 (little or no similarity) and 4
     * (strong similarity or identical values).</li>
     * <li>For refined Soundex the result can be greater than 4.</li>
     * </ul>
     *
     * @param es1
     *            first encoded string
     * @param es2
     *            second encoded string
     * @return number of positions with equal characters; 0 if either argument is <code>null</code>
     *
     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
     *      MS T-SQL DIFFERENCE</a>
     */
    static int differenceEncoded(String es1, String es2) {
        if (es1 == null || es2 == null) {
            return 0;
        }
        final int limit = Math.min(es1.length(), es2.length());
        int same = 0;
        for (int pos = 0; pos < limit; pos++) {
            if (es1.charAt(pos) == es2.charAt(pos)) {
                same++;
            }
        }
        return same;
    }
}

@ -0,0 +1,74 @@
/**
* Literal
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 18.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
/**
 * A literal is one possible value of a predicate; the set of all literals
 * of a predicate is the norm of that predicate.
 * Each literal can carry an explanation, which is expressed as a link to
 * the resource that explains the literal.
 */
public interface Literal {

    /**
     * The terminal is the actual content of a property and, at the same
     * time, the visual representation of that content when the literal is
     * assigned to the property.
     * @return a string representing the literal
     */
    String getTerminal();

    /**
     * The subject of a literal is a reference to a resource that explains
     * the literal. If an object has properties from different vocabularies
     * attached, and those properties have actual literal instances
     * assigned, then the set of subjects of these literals explains the
     * object as a co-notation to knowledge. Subjects of literals shall
     * therefore be knowledge authorities for the predicates to which the
     * literal is assigned.
     * @return an url to a knowledge authority for the literal
     */
    MultiProtocolURI getSubject();

    /**
     * If a resource is poorly annotated with metadata and it shall be
     * annotated automatically, then the terminal of a literal may be too
     * weak to discover literals in the resource. An additional discovery
     * pattern may help to reduce the set of literals that can be discovered
     * automatically. A discovery pattern does not replace the terminal: it
     * is an additional pattern that must also occur in the resource in which
     * the terminal of the literal appears. If the terminal alone is
     * sufficient to discover the literal, the discovery pattern may be a
     * catch-all '.*' pattern.
     * @return the discovery pattern to identify the literal in the resource.
     */
    Pattern getDiscoveryPattern();
}

@ -0,0 +1,113 @@
/**
* Node
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 16.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod;
import java.util.HashMap;
import java.util.Map;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.vocabulary.Rdf;
/**
 * Class for a RDF node element: a map from predicate names to object values
 * together with the node type given at construction time. The subject of the
 * node, if there is one, is kept in the same map under the key
 * {@link #SUBJECT}. For a short primer see
 * http://www.w3.org/TR/REC-rdf-syntax/
 */
public class Node extends HashMap<String, byte[]> implements Map<String, byte[]> {

    private static final long serialVersionUID = -6715118942251224832L;

    /** map key under which the subject of the node is stored */
    public static final String SUBJECT = "rdf:about";

    private final Rdf type;

    /**
     * create a blank node (no subject, no properties)
     * @param type the type of the node
     */
    public Node(Rdf type) {
        super();
        this.type = type;
    }

    /**
     * create a node that has only a subject
     * @param type the type of the node
     * @param subject the value stored under {@link #SUBJECT}
     */
    public Node(Rdf type, byte[] subject) {
        this(type);
        this.put(SUBJECT, subject);
    }

    /**
     * initialize the triples.
     * one of the properties must be the resource SUBJECT
     * for a blank node the SUBJECT can be omitted
     * @param type the type of the node
     * @param set the properties which are copied into this node
     */
    public Node(Rdf type, Map<String, byte[]> set) {
        this(type);
        this.putAll(set);
    }

    /**
     * @return the type that was given to the constructor
     */
    public Rdf getType() {
        return this.type;
    }

    /**
     * @return true if no entry for {@link #SUBJECT} exists in this node
     */
    public boolean isBlank() {
        return !this.containsKey(SUBJECT);
    }

    /**
     * @return the value stored under {@link #SUBJECT} or null if there is none
     */
    public byte[] getSubject() {
        return this.get(SUBJECT);
    }

    /**
     * store the given value under {@link #SUBJECT}
     * @param subject
     */
    public void setSubject(byte[] subject) {
        this.put(SUBJECT, subject);
    }

    /**
     * @param predicate
     * @return the value stored for the predicate name or null if there is none
     */
    public byte[] getObject(Vocabulary predicate) {
        final String name = predicate.getPredicate();
        return this.get(name);
    }

    /**
     * @param predicate
     * @param object the new value for the predicate name
     * @return the value that was stored for the predicate name before, or null
     */
    public byte[] setObject(Vocabulary predicate, byte[] object) {
        final String name = predicate.getPredicate();
        return this.put(name, object);
    }

    /**
     * @param predicate
     * @return the value that was removed for the predicate name, or null
     */
    public byte[] removePredicate(Vocabulary predicate) {
        final String name = predicate.getPredicate();
        return this.remove(name);
    }

    /**
     * write the node as an xml element: the element name is the predicate of
     * the node type, the subject (if present) becomes the rdf:about attribute
     * and every other entry becomes one child element on a line of its own.
     * Values are written as they are, no escaping is applied.
     * @return the bytes of the xml text as produced by UTF8.getBytes
     */
    public byte[] toObject() {
        final String tag = this.type.getPredicate();
        final StringBuilder xml = new StringBuilder(this.size() * 50);

        // opening tag, with the subject as attribute if there is one
        xml.append('<').append(tag);
        final byte[] about = this.get(SUBJECT);
        if (about != null) {
            xml.append(" rdf:about=\"").append(UTF8.String(about)).append('\"');
        }
        xml.append(">\n");

        // one child element per property; the subject was already written above
        for (final Map.Entry<String, byte[]> property : this.entrySet()) {
            final String name = property.getKey();
            if (!name.equals(SUBJECT)) {
                xml.append('<').append(name).append('>')
                   .append(UTF8.String(property.getValue()))
                   .append("</").append(name).append(">\n");
            }
        }

        // closing tag
        xml.append("</").append(tag).append(">\n");
        return UTF8.getBytes(xml);
    }
}

@ -0,0 +1,78 @@
/**
* Syntax
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 17.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod;
import java.util.HashMap;
import java.util.Map;
import net.yacy.cora.lod.vocabulary.CreativeCommons;
import net.yacy.cora.lod.vocabulary.DublinCore;
import net.yacy.cora.lod.vocabulary.Foaf;
import net.yacy.cora.lod.vocabulary.Geo;
import net.yacy.cora.lod.vocabulary.HttpHeader;
import net.yacy.cora.lod.vocabulary.Rdf;
import net.yacy.cora.lod.vocabulary.YaCyMetadata;
/**
 * Helper class to understand xml tags and vocabularies: it maps predicate
 * names to the vocabulary constants that declare them.
 */
public class Syntax {

    /** the vocabulary enum classes whose constants are registered in the tag map */
    private static final Class<?>[] vocabularies = new Class<?>[] {
        CreativeCommons.class,
        DublinCore.class,
        Foaf.class,
        Geo.class,
        HttpHeader.class,
        Rdf.class,
        YaCyMetadata.class
    };

    /** lookup table from predicate name to the vocabulary constant */
    private static final Map<String, Vocabulary> tagMap = new HashMap<String, Vocabulary>();

    static {
        // register every enum constant of every vocabulary under its predicate name
        for (int i = 0; i < vocabularies.length; i++) {
            final Object[] constants = vocabularies[i].getEnumConstants();
            for (int j = 0; j < constants.length; j++) {
                final Vocabulary term = (Vocabulary) constants[j];
                tagMap.put(term.getPredicate(), term);
            }
        }
    }

    /**
     * recognizer for vocabulary tag names
     * @param tag the predicate name to look up
     * @return the vocabulary object for the given tag or null if the tag is not registered
     */
    public static Vocabulary getVocabulary(String tag) {
        return tagMap.get(tag);
    }

    /**
     * print the complete tag map to standard out
     * @param args not used
     */
    public static void main(String[] args) {
        System.out.println(tagMap);
    }
}

@ -0,0 +1,124 @@
/**
* TripleStore
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 16.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod;
import java.util.AbstractMap;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.lod.vocabulary.Rdf;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.storage.MapStore;
/**
 * A store of RDF nodes which are addressed by a byte[] id. All operations are
 * delegated to the MapStore given at construction time; property maps that
 * come back from that store are wrapped into Node objects of type
 * Rdf.Description.
 */
public class TripleStore {

    MapStore store;

    /**
     * @param store the backing store that all operations are delegated to
     */
    public TripleStore(MapStore store) {
        this.store = store;
    }

    /**
     * wrap a property map from the backing store into a node
     * @param properties a property map or null
     * @return a Rdf.Description node with the given properties, or null if properties is null
     */
    private static Node asDescription(final Map<String, byte[]> properties) {
        return properties == null ? null : new Node(Rdf.Description, properties);
    }

    public void clear() {
        this.store.clear();
    }

    public boolean contains(byte[] id) {
        return this.store.containsKey(id);
    }

    /**
     * @param id
     * @return the node stored for the id or null if the store has no entry for it
     */
    public Node get(byte[] id) {
        return asDescription(this.store.get(id));
    }

    public boolean isEmpty() {
        return this.store.isEmpty();
    }

    /**
     * @param id
     * @param node
     * @return what the backing store returned from its put, wrapped as node; null if it returned null
     */
    public Node put(byte[] id, Node node) {
        return asDescription(this.store.put(id, node));
    }

    /**
     * put every entry that the iterator of the other store delivers into this store
     * @param entries
     */
    public void putAll(TripleStore entries) {
        for (final Iterator<Map.Entry<byte[], Node>> source = entries.iterator(); source.hasNext();) {
            final Map.Entry<byte[], Node> next = source.next();
            this.put(next.getKey(), next.getValue());
        }
    }

    /**
     * @param id
     * @return what the backing store returned from its remove, wrapped as node; null if it returned null
     */
    public Node remove(byte[] id) {
        return asDescription(this.store.remove(id));
    }

    public int size() {
        return this.store.size();
    }

    /**
     * @return an iterator over (id, node) entries that is driven by a fresh id iterator
     */
    public Iterator<java.util.Map.Entry<byte[], Node>> iterator() {
        return new NodeIterator(this.idIterator());
    }

    /**
     * iterator that reads ids from a given id iterator and looks up the node for each of them
     */
    private final class NodeIterator implements Iterator<Map.Entry<byte[], Node>> {

        private final Iterator<byte[]> ids;

        NodeIterator(final Iterator<byte[]> ids) {
            this.ids = ids;
        }

        @Override
        public boolean hasNext() {
            return this.ids.hasNext();
        }

        @Override
        public Map.Entry<byte[], Node> next() {
            final byte[] key = this.ids.next();
            // a null id is passed on as a null entry
            return key == null
                    ? null
                    : new AbstractMap.SimpleImmutableEntry<byte[], Node>(key, TripleStore.this.get(key));
        }

        @Override
        public void remove() {
            this.ids.remove();
        }
    }

    public ByteOrder getOrdering() {
        return this.store.getOrdering();
    }

    public CloneableIterator<byte[]> idIterator() {
        return this.store.keyIterator();
    }

    public void close() {
        this.store.close();
    }
}

@ -0,0 +1,72 @@
/**
* Vocabulary
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 16.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod;
import java.util.Set;
/**
 * A Vocabulary is an interface to an 'extensible enum pattern'.
 * We want to have a kind of extensible enum for vocabularies.
 * Since enum classes cannot be extended, we use a hack as explained in
 * http://blogs.oracle.com/darcy/entry/enums_and_mixins .
 * For an example of the 'extensible enum pattern' see
 * http://stackoverflow.com/questions/1414755/java-extend-enum
 */
public interface Vocabulary {

    /**
     * get the RDF identifier
     * @return the RDF identifier of the vocabulary
     */
    String getIdentifier();

    /**
     * get the prefix for the predicates of this vocabulary
     * @return the prefix of the vocabulary
     */
    String getPrefix();

    /**
     * get the predicate name which already contains the prefix and the ':'
     * @return the predicate name including prefix and ':'
     */
    String getPredicate();

    /**
     * get a set of literals that are allowed for the predicate as values
     * @return the set of literals of the predicate
     */
    Set<Literal> getLiterals();

    /**
     * The name method is identical to the java.lang.Enum method.
     * If an Enum class for vocabularies implements this interface,
     * the name() method is implemented automatically.
     *
     * @return Returns the name of the enum constant as declared in the enum declaration.
     */
    String name();
}

@ -0,0 +1,207 @@
/**
* CreativeCommons
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 17.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod.vocabulary;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.lod.Literal;
import net.yacy.cora.lod.Vocabulary;
/**
 * a vocabulary for creative commons license declarations. see:
 * http://creativecommons.org/ns#
 *
 * The predicates permits, requires and prohibits carry a fixed set of allowed
 * literals; all other predicates are declared without literals.
 */
public enum CreativeCommons implements Vocabulary {

    // License Properties
    permits(new Literal[]{
        PermitLiteral.Reproduction,
        PermitLiteral.Distribution,
        PermitLiteral.DerivativeWorks,
        PermitLiteral.Sharing}),
    requires(new Literal[]{
        RequirementLiteral.Notice,
        RequirementLiteral.Attribution,
        RequirementLiteral.ShareAlike,
        RequirementLiteral.SourceCode,
        RequirementLiteral.Copyleft,
        RequirementLiteral.LesserCopyleft}),
    prohibits(new Literal[]{
        ProhibitionLiteral.CommercialUse,
        ProhibitionLiteral.HighIncomeNationUse}),
    jurisdiction,
    legalcode,
    deprecatedOn,

    // Work Properties
    license,
    morePermissions,
    attributionName,
    attributionURL,
    useGuidelines;

    /**
     * the literals that are allowed as values of the 'permits' predicate
     */
    enum PermitLiteral implements Literal {

        Reproduction("Reproduction", "http://creativecommons.org/ns#Permission", ".*"),
        Distribution("Distribution", "http://creativecommons.org/ns#Permission", ".*"),
        DerivativeWorks("Derivative Works", "http://creativecommons.org/ns#Permission", ".*"),
        Sharing("Sharing", "http://creativecommons.org/ns#Permission", ".*");

        String terminal;
        MultiProtocolURI subject;
        Pattern discoveryPattern;

        /**
         * @param terminal the visible content of the literal
         * @param subject url of the resource that explains the literal, may be null
         * @param discoveryPattern regular expression, null is replaced by the catch-all '.*'
         */
        private PermitLiteral(
                String terminal,
                String subject,
                String discoveryPattern) {
            this.terminal = terminal;
            try {
                this.subject = subject == null ? null : new MultiProtocolURI(subject);
            } catch (MalformedURLException e) {
                // a malformed url leaves the literal without a subject instead of
                // making the initialization of the enum fail
                this.subject = null;
            }
            this.discoveryPattern = Pattern.compile(discoveryPattern == null ? ".*" : discoveryPattern);
        }

        @Override
        public String getTerminal() { return this.terminal; }

        @Override
        public MultiProtocolURI getSubject() { return this.subject; }

        @Override
        public Pattern getDiscoveryPattern() { return this.discoveryPattern; }
    }

    /**
     * the literals that are allowed as values of the 'requires' predicate
     */
    enum RequirementLiteral implements Literal {

        Notice("Notice", "http://creativecommons.org/ns#Requirement", ".*"),
        Attribution("Attribution", "http://creativecommons.org/ns#Requirement", ".*"),
        ShareAlike("Share Alike", "http://creativecommons.org/ns#Requirement", ".*"),
        SourceCode("Source Code", "http://creativecommons.org/ns#Requirement", ".*"),
        Copyleft("Copyleft", "http://creativecommons.org/ns#Requirement", ".*"),
        LesserCopyleft("Lesser Copyleft", "http://creativecommons.org/ns#Requirement", ".*");

        String terminal;
        MultiProtocolURI subject;
        Pattern discoveryPattern;

        /**
         * @param terminal the visible content of the literal
         * @param subject url of the resource that explains the literal, may be null
         * @param discoveryPattern regular expression, null is replaced by the catch-all '.*'
         */
        private RequirementLiteral(
                String terminal,
                String subject,
                String discoveryPattern) {
            this.terminal = terminal;
            try {
                this.subject = subject == null ? null : new MultiProtocolURI(subject);
            } catch (MalformedURLException e) {
                // a malformed url leaves the literal without a subject instead of
                // making the initialization of the enum fail
                this.subject = null;
            }
            this.discoveryPattern = Pattern.compile(discoveryPattern == null ? ".*" : discoveryPattern);
        }

        @Override
        public String getTerminal() { return this.terminal; }

        @Override
        public MultiProtocolURI getSubject() { return this.subject; }

        @Override
        public Pattern getDiscoveryPattern() { return this.discoveryPattern; }
    }

    /**
     * the literals that are allowed as values of the 'prohibits' predicate
     */
    enum ProhibitionLiteral implements Literal {

        CommercialUse("Commercial Use", "http://creativecommons.org/ns#Prohibition", ".*"),
        HighIncomeNationUse("High Income Nation Use", "http://creativecommons.org/ns#Prohibition", ".*");

        String terminal;
        MultiProtocolURI subject;
        Pattern discoveryPattern;

        /**
         * @param terminal the visible content of the literal
         * @param subject url of the resource that explains the literal, may be null
         * @param discoveryPattern regular expression, null is replaced by the catch-all '.*'
         */
        private ProhibitionLiteral(
                String terminal,
                String subject,
                String discoveryPattern) {
            this.terminal = terminal;
            try {
                this.subject = subject == null ? null : new MultiProtocolURI(subject);
            } catch (MalformedURLException e) {
                // a malformed url leaves the literal without a subject instead of
                // making the initialization of the enum fail
                this.subject = null;
            }
            this.discoveryPattern = Pattern.compile(discoveryPattern == null ? ".*" : discoveryPattern);
        }

        @Override
        public String getTerminal() { return this.terminal; }

        @Override
        public MultiProtocolURI getSubject() { return this.subject; }

        @Override
        public Pattern getDiscoveryPattern() { return this.discoveryPattern; }
    }

    public final static String IDENTIFIER = "http://creativecommons.org/ns#";
    public final static String PREFIX = "cc";

    private final String predicate;

    /** the allowed literals of the predicate; null if the predicate was declared without literals */
    private final Set<Literal> literals;

    /**
     * declare a predicate without a set of allowed literals
     */
    private CreativeCommons() {
        this.predicate = PREFIX + ":" + this.name();
        this.literals = null;
    }

    /**
     * declare a predicate together with the literals that are allowed as its values
     * @param literals the allowed literals
     */
    private CreativeCommons(Literal[] literals) {
        this.predicate = PREFIX + ":" + this.name();
        final Set<Literal> allowed = new HashSet<Literal>();
        for (final Literal l: literals) {
            allowed.add(l);
        }
        // enum constants are shared by all callers, therefore the set is handed out read-only
        this.literals = java.util.Collections.unmodifiableSet(allowed);
    }

    @Override
    public String getIdentifier() {
        return IDENTIFIER;
    }

    @Override
    public String getPrefix() {
        return PREFIX;
    }

    /**
     * @return an unmodifiable set of the literals that were declared for this predicate,
     *         or null if the predicate was declared without literals
     */
    @Override
    public Set<Literal> getLiterals() {
        // return the declared literals; this method formerly returned null for every
        // predicate, which made the literal sets of permits, requires and prohibits unreachable
        return this.literals;
    }

    @Override
    public String getPredicate() {
        return this.predicate;
    }
}

@ -0,0 +1,79 @@
/**
* DublinCore
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 16.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod.vocabulary;
import java.util.Set;
import net.yacy.cora.lod.Literal;
import net.yacy.cora.lod.Vocabulary;
/**
 * The fifteen elements of the Dublin Core metadata element set as vocabulary.
 * The predicate of each element is the prefix "dc", a ':' and the lower-case
 * name of the enum constant, e.g. "dc:title".
 */
public enum DublinCore implements Vocabulary {

    Contributor,
    Coverage,
    Creator,
    Date,
    Description,
    Format,
    Identifier,
    Language,
    Publisher,
    Relation,
    Rights,
    Source,
    Subject,
    Title,
    Type;

    // NOTE(review): this is the url of the element set documentation; the namespace that is
    // commonly bound to the dc prefix is http://purl.org/dc/elements/1.1/ - confirm which one is wanted
    public final static String IDENTIFIER = "http://dublincore.org/documents/2010/10/11/dces/";
    public final static String PREFIX = "dc";

    private final String predicate;

    private DublinCore() {
        // The lower-case conversion must not depend on the default locale of the JVM:
        // with a Turkish or Azeri default locale 'I' is mapped to a dotless 'ı' and
        // "Identifier" would produce the predicate "dc:ıdentifier".
        this.predicate = PREFIX + ":" + this.name().toLowerCase(java.util.Locale.ROOT);
    }

    @Override
    public String getIdentifier() {
        return IDENTIFIER;
    }

    @Override
    public String getPrefix() {
        return PREFIX;
    }

    /**
     * @return always null, no literals are declared for this vocabulary
     */
    @Override
    public Set<Literal> getLiterals() {
        return null;
    }

    /**
     * @return the prefix, a ':' and the lower-case name of the constant
     */
    @Override
    public String getPredicate() {
        return this.predicate;
    }
}

@ -0,0 +1,62 @@
/**
* Foaf
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 17.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod.vocabulary;
import java.util.Set;
import net.yacy.cora.lod.Literal;
import net.yacy.cora.lod.Vocabulary;
/**
 * The friend of a friend vocabulary. see:
 * http://xmlns.com/foaf/spec/
 *
 * No terms are declared yet. The identifier, prefix and predicate handling is
 * implemented in the same way as in the other vocabularies, so that terms which
 * are added to this enum get a proper predicate name instead of null.
 */
public enum Foaf implements Vocabulary {

    ; // no terms declared yet

    /** the namespace of the FOAF vocabulary as given in the FOAF specification */
    public final static String IDENTIFIER = "http://xmlns.com/foaf/0.1/";
    public final static String PREFIX = "foaf";

    private final String predicate;

    private Foaf() {
        // FOAF term names are case-sensitive, therefore the name is used as it is declared
        this.predicate = PREFIX + ":" + this.name();
    }

    @Override
    public String getIdentifier() {
        return IDENTIFIER;
    }

    @Override
    public String getPrefix() {
        return PREFIX;
    }

    /**
     * @return always null, no literals are declared for this vocabulary
     */
    @Override
    public Set<Literal> getLiterals() {
        return null;
    }

    /**
     * @return the prefix, a ':' and the name of the constant
     */
    @Override
    public String getPredicate() {
        return this.predicate;
    }
}

@ -0,0 +1,65 @@
/**
* Geo
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 16.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod.vocabulary;
import java.util.Set;
import net.yacy.cora.lod.Literal;
import net.yacy.cora.lod.Vocabulary;
/**
 * Vocabulary for geographic positions (longitude and latitude).
 * The predicate of each term is the prefix "geo", a ':' and the lower-case
 * name of the enum constant, i.e. "geo:long" and "geo:lat".
 */
public enum Geo implements Vocabulary {

    Long,
    Lat;

    public final static String IDENTIFIER = "http://www.w3.org/2003/01/geo/wgs84_pos#";
    public final static String PREFIX = "geo";

    private final String predicate;

    private Geo() {
        // Lower-case conversion with a fixed locale, so that the predicate name cannot
        // change with the default locale of the JVM (e.g. 'I' becomes a dotless 'ı' in
        // a Turkish locale). The present constants are not affected, future ones may be.
        this.predicate = PREFIX + ":" + this.name().toLowerCase(java.util.Locale.ROOT);
    }

    @Override
    public String getIdentifier() {
        return IDENTIFIER;
    }

    @Override
    public String getPrefix() {
        return PREFIX;
    }

    /**
     * @return always null, no literals are declared for this vocabulary
     */
    @Override
    public Set<Literal> getLiterals() {
        return null;
    }

    /**
     * @return the prefix, a ':' and the lower-case name of the constant
     */
    @Override
    public String getPredicate() {
        return this.predicate;
    }
}

@ -0,0 +1,117 @@
/**
* HttpHeader
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 16.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod.vocabulary;
import java.util.Set;
import net.yacy.cora.lod.Literal;
import net.yacy.cora.lod.Vocabulary;
/**
 * Vocabulary of http header fields. Each constant stands for one header;
 * its predicate is the prefix "http", a ':' and the name of the constant.
 */
public enum HttpHeader implements Vocabulary {

    // properties which may appear in nodes of type Request:
    accept,             // the Accept header
    acceptCharset,      // the Accept-Charset header
    acceptEncoding,     // the Accept-Encoding header
    acceptLanguage,     // the Accept-Language header
    authorization,      // the Authorization header
    expect,             // the Expect header
    from,               // the From header
    host,               // the Host header
    ifMatch,            // the If-Match header
    ifModifiedSince,    // the If-Modified-Since header
    ifNoneMatch,        // the If-None-Match header
    ifRange,            // the If-Range header
    ifUnmodifiedSince,  // the If-Unmodified-Since header
    maxForwards,        // the Max-Forwards header
    proxyAuthorization, // the Proxy-Authorization header
    range,              // the Range header
    referer,            // the Referer header
    te,                 // the TE header
    userAgent,          // the User-Agent header

    // properties which may appear in nodes of type Response:
    acceptRanges,       // the Accept-Ranges header
    age,                // the Age header
    etag,               // the ETag header
    location,           // the Location header
    proxyAuthenticate,  // the Proxy-Authenticate header
    retryAfter,         // the Retry-After header
    server,             // the Server header
    vary,               // the Vary header
    wwwAuthenticate,    // the WWW-Authenticate header

    // properties which may appear in nodes of type Request or Response:
    allow,              // the Allow header
    cacheControl,       // the Cache-Control header
    connection,         // the Connection header
    contentEncoding,    // the Content-Encoding header
    contentLanguage,    // the Content-Language header
    contentLength,      // the Content-Length header
    contentLocation,    // the Content-Location header
    contentMD5,         // the Content-MD5 header
    contentRange,       // the Content-Range header
    contentType,        // the Content-Type header
    date,               // the Date header
    expires,            // the Expires header
    lastModified,       // the Last-Modified header
    pragma,             // the Pragma header
    trailer,            // the Trailer header
    transferEncoding,   // the Transfer-Encoding header
    upgrade,            // the Upgrade header
    via,                // the Via header
    warning;            // the Warning header

    public static final String IDENTIFIER = "http://www.w3.org/WAI/ER/HTTP/WD-HTTP-in-RDF-20060131";
    public static final String PREFIX = "http";

    /** prefix, ':' and constant name; computed once per constant */
    private final String predicate;

    private HttpHeader() {
        this.predicate = PREFIX + ':' + name();
    }

    /**
     * @return the value of {@link #IDENTIFIER}
     */
    @Override
    public String getIdentifier() {
        return IDENTIFIER;
    }

    /**
     * @return the value of {@link #PREFIX}
     */
    @Override
    public String getPrefix() {
        return PREFIX;
    }

    /**
     * @return always null, no literals are declared for this vocabulary
     */
    @Override
    public Set<Literal> getLiterals() {
        return null;
    }

    /**
     * @return the prefix, a ':' and the name of the constant
     */
    @Override
    public String getPredicate() {
        return this.predicate;
    }
}

@ -0,0 +1,69 @@
/**
* Rdf
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 17.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod.vocabulary;
import java.util.Set;
import net.yacy.cora.lod.Literal;
import net.yacy.cora.lod.Vocabulary;
/**
 * Vocabulary of the RDF syntax elements that are used as node types.
 * The predicate of each term is the prefix "rdf", a ':' and the name of
 * the constant, e.g. "rdf:Description".
 */
public enum Rdf implements Vocabulary {

    RDF,
    Description,
    Bag,
    Seq,
    Alt;

    public static final String IDENTIFIER = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
    public static final String PREFIX = "rdf";

    /** prefix, ':' and constant name; computed once per constant */
    private final String predicate;

    private Rdf() {
        this.predicate = PREFIX + ':' + name();
    }

    /**
     * @return the value of {@link #IDENTIFIER}
     */
    @Override
    public String getIdentifier() {
        return IDENTIFIER;
    }

    /**
     * @return the value of {@link #PREFIX}
     */
    @Override
    public String getPrefix() {
        return PREFIX;
    }

    /**
     * @return always null, no literals are declared for this vocabulary
     */
    @Override
    public Set<Literal> getLiterals() {
        return null;
    }

    /**
     * @return the prefix, a ':' and the name of the constant
     */
    @Override
    public String getPredicate() {
        return this.predicate;
    }
}

@ -0,0 +1,105 @@
/**
* YaCyMetadata
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 16.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod.vocabulary;
import java.util.Set;
import net.yacy.cora.lod.Literal;
import net.yacy.cora.lod.Vocabulary;
/**
 * This is the vocabulary of the 'classic' YaCy metadata database.
 * The predicate of each term is the prefix "yacy", a ':' and the name of
 * the constant.
 */
public enum YaCyMetadata implements Vocabulary {

    hash,     // the url's hash
    mod,      // last-modified from the httpd
    load,     // time when the url was loaded
    fresh,    // time until this url is fresh
    referrer, // (one of) the url's referrer hash(es)
    md5,      // the md5 of the url content (to identify changes)
    size,     // size of file in bytes
    wc,       // size of file by number of words; for video and audio: seconds
    dt,       // doctype, taken from extension or any other heuristic
    flags,    // flags; any stuff (see Word-Entity definition)
    lang,     // language
    llocal,   // # of outlinks to same domain; for video and image: width
    lother,   // # of outlinks to outside domain; for video and image: height
    limage,   // # of embedded image links
    laudio,   // # of embedded audio links; for audio: track number; for video: number of audio tracks
    lvideo,   // # of embedded video links
    lapp;     // # of embedded links to applications

    /*
     * For reference, the column definition that the constants above were taken from
     * (presumably the row definition of the classic metadata table - TODO confirm):
     *
        "String hash-12, " +            // the url's hash
        "Cardinal mod-4 {b256}, " +     // last-modified from the httpd
        "Cardinal load-4 {b256}, " +    // time when the url was loaded
        "Cardinal fresh-4 {b256}, " +   // time until this url is fresh
        "String referrer-12, " +        // (one of) the url's referrer hash(es)
        "byte[] md5-8, " +              // the md5 of the url content (to identify changes)
        "Cardinal size-6 {b256}, " +    // size of file in bytes
        "Cardinal wc-3 {b256}, " +      // size of file by number of words; for video and audio: seconds
        "byte[] dt-1, " +               // doctype, taken from extension or any other heuristic
        "Bitfield flags-4, " +          // flags; any stuff (see Word-Entity definition)
        "String lang-2, " +             // language
        "Cardinal llocal-2 {b256}, " +  // # of outlinks to same domain; for video and image: width
        "Cardinal lother-2 {b256}, " +  // # of outlinks to outside domain; for video and image: height
        "Cardinal limage-2 {b256}, " +  // # of embedded image links
        "Cardinal laudio-2 {b256}, " +  // # of embedded audio links; for audio: track number; for video: number of audio tracks
        "Cardinal lvideo-2 {b256}, " +  // # of embedded video links
        "Cardinal lapp-2 {b256}",       // # of embedded links to applications
     */

    public static final String IDENTIFIER = "http://yacy.net/vocabularies/yacymetadata#";
    public static final String PREFIX = "yacy";

    /** prefix, ':' and constant name; computed once per constant */
    private final String predicate;

    private YaCyMetadata() {
        this.predicate = PREFIX + ':' + name();
    }

    /**
     * @return the value of {@link #IDENTIFIER}
     */
    @Override
    public String getIdentifier() {
        return IDENTIFIER;
    }

    /**
     * @return the value of {@link #PREFIX}
     */
    @Override
    public String getPrefix() {
        return PREFIX;
    }

    /**
     * @return always null, no literals are declared for this vocabulary
     */
    @Override
    public Set<Literal> getLiterals() {
        return null;
    }

    /**
     * @return the prefix, a ':' and the name of the constant
     */
    @Override
    public String getPredicate() {
        return this.predicate;
    }
}

@ -22,7 +22,8 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.ranking;
package net.yacy.cora.order;
public abstract class AbstractOrder<A> implements Order<A> {

@ -0,0 +1,52 @@
/**
* ByteOrder
* (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 10.01.2008 on http://yacy.net
*
* $LastChangedDate$
* $LastChangedRevision$
* $LastChangedBy$
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.order;
/**
 * An {@link Order} specialised to {@code byte[]} values.
 *
 * In addition to the whole-array operations inherited from {@link Order},
 * this interface declares variants that take explicit offsets and lengths.
 * NOTE(review): the offset/length variants presumably operate on the given
 * sub-range of the array only - confirm against the implementations.
 */
public interface ByteOrder extends Order<byte[]> {

    // ---- well-formedness --------------------------------------------------

    @Override
    boolean wellformed(byte[] a);

    /** Range variant of {@link #wellformed(byte[])} taking a start offset and a length. */
    boolean wellformed(byte[] a, int start, int len);

    // ---- comparison -------------------------------------------------------

    @Override
    int compare(byte[] a, byte[] b);

    /** Variant of {@link #compare(byte[], byte[])} that additionally takes a length. */
    int compare(byte[] a, byte[] b, int len);

    /** Variant of {@link #compare(byte[], byte[])} taking a start offset per array and a common length. */
    int compare(byte[] a, int astart, byte[] b, int bstart, int len);

    // ---- equality ---------------------------------------------------------

    @Override
    boolean equal(byte[] a, byte[] b);

    /** Variant of {@link #equal(byte[], byte[])} taking a start offset per array and a common length. */
    boolean equal(byte[] a, int astart, byte[] b, int bstart, int length);

    // ---- miscellaneous ----------------------------------------------------

    /** Computes a {@code long} value for the array range given by {@code off} and {@code len}. */
    long cardinal(byte[] a, int off, int len);

    /** Selects one of the two arguments; by its name, the smaller one with respect to this order. */
    byte[] smallest(byte[] a, byte[] b);

    /** Selects one of the two arguments; by its name, the larger one with respect to this order. */
    byte[] largest(byte[] a, byte[] b);
}

@ -0,0 +1,35 @@
/**
* CloneableIterator
* (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 10.01.2008 on http://yacy.net
*
* $LastChangedDate$
* $LastChangedRevision$
* $LastChangedBy$
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.order;
import java.util.Iterator;
/**
 * An {@link Iterator} that can hand out a new iterator of the same kind.
 *
 * @param <E> type of the iterated elements
 */
public interface CloneableIterator<E> extends Iterator<E> {

    /**
     * Clones the iterator using a modifier.
     *
     * @param modifier implementation-defined argument for the clone;
     *                 it can be, for example, a re-start position
     * @return the cloned iterator
     */
    CloneableIterator<E> clone(Object modifier);
}

@ -0,0 +1,84 @@
/**
* CloneableMapIterator
* (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 25.04.2007 on http://yacy.net
*
* $LastChangedDate$
* $LastChangedRevision$
* $LastChangedBy$
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.order;
import java.util.Iterator;
import java.util.TreeMap;
/**
 * Iterates over the keys (not the entries) of a {@link TreeMap} in the
 * map's key order, optionally beginning at a given start key, and can be
 * cloned with a different start key.
 *
 * The iterator always holds the next key to be delivered in {@link #next};
 * a value of {@code null} there means that the iteration is exhausted.
 *
 * @param <E> key type of the underlying map
 */
public class CloneableMapIterator<E> implements CloneableIterator<E> {

    TreeMap<E, ?> map;
    E next, last;
    Object start;
    Iterator<E> iter;

    /**
     * @param map   the map whose keys are iterated; it must contain either a
     *              byte[]/Object or a String/Object mapping
     * @param start the key to start at (must be either of type byte[] or String),
     *              or {@code null} to start at the first key. The iteration begins
     *              at the first key that is greater than or equal to {@code start}.
     */
    public CloneableMapIterator(final TreeMap<E, ?> map, final E start) {
        // this iterator iterates only the key elements of the map
        this.map = map;
        this.start = start;
        // tailMap honours the map's comparator as well as natural ordering, so no
        // explicit (possibly null) comparator has to be consulted here; it also
        // yields an empty view when no key qualifies, which leaves 'next' == null
        this.iter = keysFrom(start, true);
        this.next = this.iter.hasNext() ? this.iter.next() : null;
        this.last = null;
    }

    /**
     * Creates a key iterator over the map, beginning at the given key.
     *
     * @param from      lower bound, or {@code null} for all keys
     * @param inclusive whether {@code from} itself belongs to the iteration
     */
    private Iterator<E> keysFrom(final E from, final boolean inclusive) {
        return (from == null)
                ? this.map.keySet().iterator()
                : this.map.tailMap(from, inclusive).keySet().iterator();
    }

    /**
     * @param modifier the start key for the new iterator (cast unchecked to the key type)
     * @return a new iterator over the same map, starting at {@code modifier}
     */
    @Override
    @SuppressWarnings("unchecked")
    public CloneableMapIterator<E> clone(final Object modifier) {
        return new CloneableMapIterator<E>(this.map, (E) modifier);
    }

    @Override
    public boolean hasNext() {
        return this.next != null;
    }

    /**
     * @return the next key (key-elements, not entry-elements),
     *         or {@code null} if the iteration is exhausted
     */
    @Override
    public E next() {
        this.last = this.next;
        if (this.iter.hasNext()) {
            this.next = this.iter.next();
        } else {
            this.next = null;
        }
        return this.last;
    }

    /**
     * Removes the key most recently returned by {@link #next()} from the map.
     * Does nothing if no key has been returned yet.
     */
    @Override
    public void remove() {
        if (this.last == null) return;
        // The underlying iterator is already one step ahead (it delivered 'next'),
        // so Iterator.remove() would delete the wrong key; remove via the map instead.
        this.map.remove(this.last);
        // The structural change invalidates the old key iterator; continue with a
        // fresh one positioned strictly behind the pre-fetched key.
        if (this.next != null) {
            this.iter = keysFrom(this.next, false);
        }
    }
}

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.ranking;
package net.yacy.cora.order;
import java.util.Comparator;

@ -0,0 +1,69 @@
/**
* RatingOrder.java
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 25.08.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.order;
import net.yacy.cora.sorting.Rating;
/**
 * An order on {@link Rating} objects that is derived from an order on the
 * rated objects: two ratings are compared by comparing the objects they wrap.
 *
 * @param <A> type of the rated objects
 */
public class RatingOrder<A> extends AbstractOrder<Rating<A>> implements Order<Rating<A>> {

    /** the order applied to the rated objects */
    Order<A> ordering;

    /**
     * @param ordering the order used to compare the objects wrapped by the ratings
     */
    public RatingOrder(final Order<A> ordering) {
        this.ordering = ordering;
    }

    /**
     * Compares two ratings by their rated objects; the scores are not considered.
     */
    @Override
    public int compare(final Rating<A> a, final Rating<A> b) {
        return this.ordering.compare(a.getObject(), b.getObject());
    }

    /** Every rating is accepted as well-formed. */
    @Override
    public boolean wellformed(final Rating<A> a) {
        return true;
    }

    @Override
    public String signature() {
        return "RA";
    }

    /**
     * @return the score of the given rating
     */
    @Override
    public long cardinal(final Rating<A> key) {
        return key.getScore();
    }

    /**
     * Two ratings are equal if their rated objects compare as equal,
     * i.e. if {@link #compare(Rating, Rating)} yields 0.
     */
    @Override
    public boolean equal(final Rating<A> a, final Rating<A> b) {
        // a comparison result of 0 (not 1, which means "greater") denotes equality
        return this.ordering.compare(a.getObject(), b.getObject()) == 0;
    }

    /**
     * @return this very instance; no copy is made
     */
    @Override
    public Order<Rating<A>> clone() {
        return this;
    }
}

@ -1,3 +1,27 @@
/**
* ClassProvider
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
* First released 13.12.2011 at http://yacy.net
*
* $LastChangedDate$
* $LastChangedRevision$
* $LastChangedBy$
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.plugin;
import java.io.File;

@ -1,66 +0,0 @@
// RatingOrder.java
// -----------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 2011
// created 25.08.2011
//
// $LastChangedDate: 2011-03-08 02:51:51 +0100 (Di, 08 Mrz 2011) $
// $LastChangedRevision: 7567 $
// $LastChangedBy: low012 $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.cora.ranking;
/**
 * An order on {@link Rating} objects that is derived from an order on the
 * rated objects: two ratings are compared by comparing the objects they wrap.
 *
 * @param <A> type of the rated objects
 */
public class RatingOrder<A> extends AbstractOrder<Rating<A>> implements Order<Rating<A>> {

    /** the order applied to the rated objects */
    Order<A> ordering;

    /**
     * @param ordering the order used to compare the objects wrapped by the ratings
     */
    public RatingOrder(final Order<A> ordering) {
        this.ordering = ordering;
    }

    /**
     * Compares two ratings by their rated objects; the scores are not considered.
     */
    @Override
    public int compare(final Rating<A> a, final Rating<A> b) {
        return this.ordering.compare(a.getObject(), b.getObject());
    }

    /** Every rating is accepted as well-formed. */
    @Override
    public boolean wellformed(final Rating<A> a) {
        return true;
    }

    @Override
    public String signature() {
        return "RA";
    }

    /**
     * @return the score of the given rating
     */
    @Override
    public long cardinal(final Rating<A> key) {
        return key.getScore();
    }

    /**
     * Two ratings are equal if their rated objects compare as equal,
     * i.e. if {@link #compare(Rating, Rating)} yields 0.
     */
    @Override
    public boolean equal(final Rating<A> a, final Rating<A> b) {
        // a comparison result of 0 (not 1, which means "greater") denotes equality
        return this.ordering.compare(a.getObject(), b.getObject()) == 0;
    }

    /**
     * @return this very instance; no copy is made
     */
    @Override
    public Order<Rating<A>> clone() {
        return this;
    }
}

@ -32,10 +32,10 @@ import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.ranking.ConcurrentScoreMap;
import net.yacy.cora.ranking.ScoreMap;
import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
public class SearchHub {

@ -26,7 +26,7 @@
package net.yacy.cora.services.federated;
import net.yacy.cora.ranking.WeakPriorityBlockingQueue;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
public class SearchResult extends WeakPriorityBlockingQueue<Object> {

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.ranking;
package net.yacy.cora.sorting;
import java.util.ArrayList;
import java.util.List;

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.ranking;
package net.yacy.cora.sorting;
import java.text.ParseException;
import java.text.SimpleDateFormat;
@ -58,10 +58,12 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
this.encnt = 0;
}
@Override
public Iterator<E> iterator() {
return this.map.keySet().iterator();
}
@Override
public synchronized void clear() {
this.map.clear();
this.pam.clear();
@ -73,6 +75,7 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
* shrink the cluster to a demanded size
* @param maxsize
*/
@Override
public void shrinkToMaxSize(final int maxsize) {
if (maxsize < 0) return;
Long key;
@ -90,6 +93,7 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
* shrink the cluster in such a way that the smallest score is equal or greater than a given minScore
* @param minScore
*/
@Override
public void shrinkToMinScore(final int minScore) {
int score;
Long key;
@ -121,7 +125,7 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
if (o instanceof Long) {
final long l = ((Long) o).longValue();
if (l < Integer.MAX_VALUE) return (int) l;
o = ((Long) o).toString();
return (int) (l & Integer.MAX_VALUE);
}
if (o instanceof Float) {
final double d = 1000f * ((Float) o).floatValue();
@ -148,13 +152,13 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
l = Long.parseLong(s);
}
// fix out-of-ranges
if (l > Integer.MAX_VALUE) return Integer.MAX_VALUE; //(int) (l & (Integer.MAX_VALUE));
if (l > Integer.MAX_VALUE) return (int) (l & Integer.MAX_VALUE);
if (l < 0) {
System.out.println("string2score: negative score for input " + s);
return 0;
}
return (int) l;
} catch (final Exception e) {
} catch (final Throwable e) {
// try it lex
int len = s.length();
if (len > 5) len = 5;
@ -188,26 +192,32 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
return this.gcount;
}
@Override
public synchronized int size() {
return this.map.size();
}
@Override
public boolean sizeSmaller(final int size) {
return this.map.size() < size;
}
@Override
public synchronized boolean isEmpty() {
return this.map.isEmpty();
}
@Override
public synchronized void inc(final E obj) {
inc(obj, 1);
}
@Override
public synchronized void dec(final E obj) {
inc(obj, -1);
}
@Override
public void set(final E obj, final int newScore) {
if (obj == null) return;
synchronized (this) {
@ -242,6 +252,7 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
this.gcount += newScore;
}
@Override
public void inc(final E obj, final int incrementScore) {
if (obj == null) return;
synchronized (this) {
@ -277,10 +288,12 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
this.gcount += incrementScore;
}
@Override
public void dec(final E obj, final int incrementScore) {
inc(obj, -incrementScore);
}
@Override
public int delete(final E obj) {
// deletes entry and returns previous score
if (obj == null) return 0;
@ -302,10 +315,12 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
return oldScore;
}
@Override
public synchronized boolean containsKey(final E obj) {
return this.map.containsKey(obj);
}
@Override
public int get(final E obj) {
if (obj == null) return 0;
final Long cs;
@ -316,30 +331,36 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
return (int) ((cs.longValue() & 0xFFFFFFFF00000000L) >> 32);
}
/**
 * @return the score encoded in the upper 32 bits of the last key of
 *         {@code pam}, or -1 if this score map has no entries
 */
@Override
public synchronized int getMaxScore() {
    if (this.map.isEmpty()) {
        return -1;
    }
    final long lastKey = this.pam.lastKey().longValue();
    // shifting drops the lower 32 bits, so masking them out beforehand is not needed
    return (int) (lastKey >> 32);
}
/**
 * @return the score encoded in the upper 32 bits of the first key of
 *         {@code pam}, or -1 if this score map has no entries
 */
@Override
public synchronized int getMinScore() {
    if (this.map.isEmpty()) {
        return -1;
    }
    final long firstKey = this.pam.firstKey().longValue();
    // shifting drops the lower 32 bits, so masking them out beforehand is not needed
    return (int) (firstKey >> 32);
}
/**
 * @return the object stored under the last key of {@code pam},
 *         or {@code null} if this score map has no entries
 */
@Override
public synchronized E getMaxKey() {
    return this.map.isEmpty() ? null : this.pam.get(this.pam.lastKey());
}
/**
 * @return the object stored under the first key of {@code pam},
 *         or {@code null} if this score map has no entries
 */
@Override
public synchronized E getMinKey() {
    return this.map.isEmpty() ? null : this.pam.get(this.pam.firstKey());
}
@Override
public String toString() {
return this.map + " / " + this.pam;
}
@Override
public synchronized Iterator<E> keys(final boolean up) {
if (up) return new simpleScoreIterator<E>();
return new reverseScoreIterator<E>();
@ -354,10 +375,12 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
this.view = ClusteredScoreMap.this.pam;
}
@Override
public boolean hasNext() {
return !this.view.isEmpty();
}
@Override
public E next() {
this.key = this.view.lastKey();
this.view = this.view.headMap(this.key);
@ -366,6 +389,7 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
return value;
}
@Override
public void remove() {
final Object val = ClusteredScoreMap.this.pam.remove(this.key);
if (val != null) ClusteredScoreMap.this.map.remove(val);
@ -382,16 +406,19 @@ public final class ClusteredScoreMap<E> extends AbstractScoreMap<E> implements R
this.ii = ClusteredScoreMap.this.pam.entrySet().iterator();
}
@Override
public boolean hasNext() {
return this.ii.hasNext();
}
@Override
public E next() {
this.entry = this.ii.next();
//System.out.println("cluster simple iterator: score = " + ((((Long) entry.getKey()).longValue() & 0xFFFFFFFF00000000L) >> 32) + ", handle = " + (((Long) entry.getKey()).longValue() & 0xFFFFFFFFL) + ", value = " + entry.getValue());
return this.entry.getValue();
}
@Override
public void remove() {
this.ii.remove();
if (this.entry.getValue() != null) ClusteredScoreMap.this.map.remove(this.entry.getValue());

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.ranking;
package net.yacy.cora.sorting;
import java.util.ArrayList;
import java.util.HashSet;

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.ranking;
package net.yacy.cora.sorting;
import java.util.ArrayList;
import java.util.Comparator;

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.ranking;
package net.yacy.cora.sorting;
import java.util.Comparator;

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.ranking;
package net.yacy.cora.sorting;
public interface ReversibleScoreMap<E> extends ScoreMap<E> {

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.ranking;
package net.yacy.cora.sorting;
import java.util.Comparator;

@ -22,7 +22,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.ranking;
package net.yacy.cora.sorting;
import java.util.Iterator;
import java.util.List;

@ -23,7 +23,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.ranking;
package net.yacy.cora.sorting;
import java.util.ArrayList;
import java.util.Comparator;

@ -0,0 +1,110 @@
/**
* AbstractMapStore
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 16.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.storage;
import java.util.AbstractMap;
import java.util.Collection;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.document.UTF8;
/**
 * Skeleton implementation of {@link MapStore}.
 *
 * The bulk views of {@link Map} ({@code entrySet}, {@code keySet},
 * {@code values}) and {@code containsValue} are rejected with an
 * {@link UnsupportedOperationException}; callers are expected to iterate
 * instead. {@code putAll} and the entry iterator are implemented on top of
 * {@code put}, {@code get} and {@code keyIterator}.
 */
public abstract class AbstractMapStore implements MapStore {

    /** Not supported; always throws {@link UnsupportedOperationException}. */
    @Override
    public boolean containsValue(Object arg0) {
        throw new UnsupportedOperationException("ContainsValue() not appropriate, use outer indexing");
    }

    /** Not supported; always throws {@link UnsupportedOperationException}. */
    @Override
    public Set<java.util.Map.Entry<byte[], Map<String, byte[]>>> entrySet() {
        throw new UnsupportedOperationException("entrySet() not appropriate, use an iterator");
    }

    /** Not supported; always throws {@link UnsupportedOperationException}. */
    @Override
    public Set<byte[]> keySet() {
        throw new UnsupportedOperationException("keySet() not appropriate, use an iterator");
    }

    /**
     * Copies all given entries into this store, one {@code put} per entry.
     * A {@link MapStore} source is read through its iterator (its
     * {@code entrySet()} may be unsupported); any other map is read through
     * its entry set.
     */
    @Override
    public void putAll(Map<? extends byte[], ? extends Map<String, byte[]>> entries) {
        if (!(entries instanceof MapStore)) {
            for (final Map.Entry<? extends byte[], ? extends Map<String, byte[]>> pair : entries.entrySet()) {
                this.put(pair.getKey(), pair.getValue());
            }
            return;
        }
        for (final Map.Entry<byte[], Map<String, byte[]>> pair : (MapStore) entries) {
            this.put(pair.getKey(), pair.getValue());
        }
    }

    /** Not supported; always throws {@link UnsupportedOperationException}. */
    @Override
    public Collection<Map<String, byte[]>> values() {
        throw new UnsupportedOperationException("values() not appropriate, use an iterator");
    }

    /**
     * @return an iterator that walks the key iterator of this store and pairs
     *         every key with the value looked up for it; an entry is
     *         {@code null} where the key iterator delivers a {@code null} key
     */
    @Override
    public Iterator<Map.Entry<byte[], Map<String, byte[]>>> iterator() {
        final Iterator<byte[]> keys = this.keyIterator();
        return new Iterator<Map.Entry<byte[], Map<String, byte[]>>>() {

            @Override
            public boolean hasNext() {
                return keys.hasNext();
            }

            @Override
            public Map.Entry<byte[], Map<String, byte[]>> next() {
                final byte[] current = keys.next();
                if (current == null) {
                    return null;
                }
                final Map<String, byte[]> value = AbstractMapStore.this.get(current);
                return new AbstractMap.SimpleImmutableEntry<byte[], Map<String, byte[]>>(current, value);
            }

            @Override
            public void remove() {
                // removal is delegated to the key iterator
                keys.remove();
            }
        };
    }

    /**
     * Renders a map as a simple tag listing: a {@code <map>} line, then one
     * {@code <key>value</key>} line per entry, then a closing {@code </map>}
     * line. Keys and values are written as they are, without escaping.
     *
     * @param map the map to render; values are converted with {@code UTF8.String}
     * @return the textual form of the map
     */
    public static String map2String(Map<String, byte[]> map) {
        final StringBuilder out = new StringBuilder(map.size() * 50);
        out.append("<map>\n");
        for (final Map.Entry<String, byte[]> field : map.entrySet()) {
            final String tag = field.getKey();
            out.append('<').append(tag).append('>');
            out.append(UTF8.String(field.getValue()));
            out.append("</").append(tag).append(">\n");
        }
        return out.append("</map>\n").toString();
    }
}

@ -0,0 +1,56 @@
/**
* MapStore
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 16.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.storage;
import java.util.Map;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
/**
 * This is a placeholder interface for the complex expression
 * {@code Map<byte[], Map<String, byte[]>>}, extended by an iterator over
 * the entries, an ordering on the keys, a key iterator and a close method.
 */
public interface MapStore extends Map<byte[], Map<String, byte[]>>, Iterable<Map.Entry<byte[], Map<String, byte[]>>> {

    /**
     * The map should have an ordering on the key elements.
     *
     * @return a byte order on the key elements
     */
    ByteOrder getOrdering();

    /**
     * The keys of the map should be iterable.
     *
     * @return an iterator on the map keys
     */
    CloneableIterator<byte[]> keyIterator();

    /**
     * Most of the MapStore implementations are file-based,
     * so a close method is part of the contract.
     */
    void close();
}

@ -12,13 +12,13 @@ import java.util.Random;
import javax.imageio.ImageIO;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.index.Index;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.RowSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.table.SQLTable;
import net.yacy.kelondro.table.SplitTable;

@ -36,7 +36,7 @@ import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.ranking.OrderedScoreMap;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;

@ -46,7 +46,7 @@ import java.util.regex.Pattern;
import javax.swing.event.EventListenerList;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.ranking.ClusteredScoreMap;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.document.SentenceReader;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.Evaluation.Element;
@ -115,17 +115,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
// class variables: collectors for links
private Map<MultiProtocolURI, Properties> anchors;
private final Map<MultiProtocolURI, Properties> anchors;
private final Map<MultiProtocolURI, String> rss, css;
private final Set<MultiProtocolURI> script, frames, iframes;
private Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final Map<String, String> metas;
private String title;
//private String headline;
private List<String>[] headlines;
private final ClusteredScoreMap<String> bold, italic;
private final List<String> li;
private CharBuffer content;
private final CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
private float lon, lat;
private MultiProtocolURI canonical;
@ -187,23 +187,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.anchors.put(url, p0);
}
/*
private void mergeAnchors(final MultiProtocolURI url, final String key, final String value) {
if (value == null) return;
if (value.length() == 0) return;
Properties p0 = this.anchors.get(url);
if (p0 == null) {
p0 = new Properties();
p0.put(key, value);
this.anchors.put(url, p0);
return;
}
// merge properties
p0.put(key, value);
this.anchors.put(url, p0);
}
*/
@Override
public void scrapeText(final char[] newtext, final String insideTag) {
// System.out.println("SCRAPE: " + UTF8.String(newtext));
int p, pl, q, s = 0;
@ -295,7 +279,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} catch (final MalformedURLException e) {}
}
// append string to content
if (b.length() != 0) this.content.append(b).append(32);
if (!b.isEmpty()) {
this.content.append(b);
this.content.appendSpace();
}
}
private final static Pattern dpssp = Pattern.compile("://");
@ -317,6 +304,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
@Override
public void scrapeTag0(final String tagname, final Properties tagopts) {
if (tagname.equalsIgnoreCase("img")) {
final String src = tagopts.getProperty("src", EMPTY_STRING);
@ -407,6 +395,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
fireScrapeTag0(tagname, tagopts);
}
@Override
public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
@ -481,6 +470,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
@Override
public void scrapeComment(final char[] comment) {
this.evaluationScores.match(Element.comment, comment);
}
@ -837,11 +827,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void close() {
// free resources
super.close();
this.anchors = null;
this.images = null;
this.anchors.clear();
this.images.clear();
this.title = null;
this.headlines = null;
this.content = null;
this.content.clear();
this.root = null;
}

@ -55,17 +55,18 @@ public class ContentTransformer extends AbstractTransformer implements Transform
super(linkTags0, linkTags1);
}
@Override
public void init(final String initarg) {
if (bluelist == null) {
if (this.bluelist == null) {
// here, the init arg is used to load a list of blue-listed words
bluelist = new ArrayList<String>();
this.bluelist = new ArrayList<String>();
final File f = new File(initarg);
if (f.canRead()) {
try {
final BufferedReader r = new BufferedReader(new FileReader(f));
String s;
while ((s = r.readLine()) != null) {
if (s.length() > 0 && s.charAt(0) != '#') bluelist.add(s.toLowerCase());
if (s.length() > 0 && s.charAt(0) != '#') this.bluelist.add(s.toLowerCase());
}
r.close();
} catch (final IOException e) {
@ -75,8 +76,9 @@ public class ContentTransformer extends AbstractTransformer implements Transform
}
}
@Override
public boolean isIdentityTransformer() {
return bluelist.isEmpty();
return this.bluelist.isEmpty();
}
private static char[] genBlueLetters(int length) {
@ -84,7 +86,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform
length = length / 2;
if (length > 10) length = 7;
while (length-- > 0) {
bb.append((int)'X');
bb.append('X');
}
bb.append("</FONT> ");
final char[] result = bb.getChars();
@ -97,16 +99,17 @@ public class ContentTransformer extends AbstractTransformer implements Transform
}
private boolean bluelistHit(final char[] text) {
if (text == null || bluelist == null) return false;
if (text == null || this.bluelist == null) return false;
final String lc = new String(text).toLowerCase();
for (int i = 0; i < bluelist.size(); i++) {
if (lc.indexOf(bluelist.get(i)) >= 0) return true;
for (int i = 0; i < this.bluelist.size(); i++) {
if (lc.indexOf(this.bluelist.get(i)) >= 0) return true;
}
return false;
}
@Override
public char[] transformText(final char[] text) {
if (bluelist != null) {
if (this.bluelist != null) {
if (bluelistHit(text)) {
// System.out.println("FILTERHIT: " + text);
return genBlueLetters(text.length);

@ -36,7 +36,7 @@ import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.ranking.ClusteredScoreMap;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;

@ -82,7 +82,7 @@ public final class TransformerWriter extends Writer {
final Transformer transformer,
final boolean passbyIfBinarySuspect
) {
this(outStream, charSet, scraper, transformer, passbyIfBinarySuspect, 1024);
this(outStream, charSet, scraper, transformer, passbyIfBinarySuspect, 4096);
}
public TransformerWriter(
@ -115,9 +115,9 @@ public final class TransformerWriter extends Writer {
public static char[] genTag0raw(final String tagname, final boolean opening, final char[] tagopts) {
final CharBuffer bb = new CharBuffer(tagname.length() + tagopts.length + 3);
bb.append((int)'<');
bb.append('<');
if (!opening) {
bb.append((int)'/');
bb.append('/');
}
bb.append(tagname);
if (tagopts.length > 0) {
@ -125,7 +125,7 @@ public final class TransformerWriter extends Writer {
bb.append(tagopts);
// else bb.append((byte) 32).append(tagopts);
}
bb.append((int)'>');
bb.append('>');
final char[] result = bb.getChars();
try {
bb.close();
@ -137,15 +137,15 @@ public final class TransformerWriter extends Writer {
public static char[] genTag1raw(final String tagname, final char[] tagopts, final char[] text) {
final CharBuffer bb = new CharBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
bb.append((int)'<').append(tagname);
bb.append('<').append(tagname);
if (tagopts.length > 0) {
// if (tagopts[0] == (byte) 32)
bb.append(tagopts);
// else bb.append((byte) 32).append(tagopts);
}
bb.append((int)'>');
bb.append('>');
bb.append(text);
bb.append((int)'<').append((int)'/').append(tagname).append((int)'>');
bb.append('<').append('/').append(tagname).append('>');
final char[] result = bb.getChars();
try {
bb.close();
@ -158,12 +158,12 @@ public final class TransformerWriter extends Writer {
public static char[] genTag0(final String tagname, final Properties tagopts, final char quotechar) {
final char[] tagoptsx = (tagopts.isEmpty()) ? null : genOpts(tagopts, quotechar);
final CharBuffer bb = new CharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
bb.append((int)'<').append(tagname);
bb.append('<').append(tagname);
if (tagoptsx != null) {
bb.append(32);
bb.appendSpace();
bb.append(tagoptsx);
}
bb.append((int)'>');
bb.append('>');
final char[] result = bb.getChars();
try {
bb.close();
@ -176,7 +176,7 @@ public final class TransformerWriter extends Writer {
public static char[] genTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
final char[] gt0 = genTag0(tagname, tagopts, quotechar);
final CharBuffer cb = new CharBuffer(gt0, gt0.length + text.length + tagname.length() + 3);
cb.append(text).append((int)'<').append((int)'/').append(tagname).append((int)'>');
cb.append(text).append('<').append('/').append(tagname).append('>');
final char[] result = cb.getChars();
try {
cb.close();
@ -193,9 +193,9 @@ public final class TransformerWriter extends Writer {
String key;
while (e.hasMoreElements()) {
key = (String) e.nextElement();
bb.append(32).append(key).append((int)'=').append((int)quotechar);
bb.appendSpace().append(key).append('=').append(quotechar);
bb.append(prop.getProperty(key));
bb.append((int)quotechar);
bb.append(quotechar);
}
final char[] result;
if (bb.length() > 0)
@ -530,12 +530,14 @@ public final class TransformerWriter extends Writer {
write(b, 0, b.length);
}
@Override
/**
 * Feeds a section of a character array into the transformer, one character at a time.
 *
 * @param b   source array
 * @param off index of the first character to write
 * @param len number of characters to write
 * @throws IOException if writing a single character fails
 * @throws IndexOutOfBoundsException if off or len is negative or off + len exceeds b.length
 */
public void write(final char b[], final int off, final int len) throws IOException {
    // System.out.println(UTF8.String(b, off, len));
    if ((off | len | (b.length - (len + off)) | (off + len)) < 0) throw new IndexOutOfBoundsException();
    // the section ends at off + len; the former bound (len - off) dropped
    // characters whenever off was not zero
    final int end = off + len;
    for (int i = off; i < end; i++) this.write(b[i]);
}
@Override
public void flush() throws IOException {
// we cannot flush the current string this.buffer to prevent that
// the filter process is messed up
@ -544,6 +546,7 @@ public final class TransformerWriter extends Writer {
// if you want to flush all, call close() at end of writing;
}
@Override
public void close() throws IOException {
final char quotechar = (this.inSingleQuote) ? singlequote : doublequote;
if (this.buffer != null) {

@ -83,6 +83,7 @@ public class htmlParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("text/csv");
}
@Override
public Document[] parse(
final MultiProtocolURI location,
final String mimeType,
@ -196,7 +197,7 @@ public class htmlParser extends AbstractParser implements Parser {
// parsing the content
final ContentScraper scraper = new ContentScraper(location);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, sourceStream.available());
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(4096, sourceStream.available()));
try {
FileUtils.copy(sourceStream, writer, c);
} catch (final IOException e) {

@ -50,10 +50,10 @@ import java.util.concurrent.TimeUnit;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.MergeIterator;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.rwi.Reference;

@ -38,10 +38,12 @@ import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.storage.MapStore;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.BDecoder;
@ -53,9 +55,7 @@ import net.yacy.kelondro.util.FileUtils;
* store a table of properties (instead of fixed-field entries) this is realized using blobs and BEncoded
* property lists
*/
public class BEncodedHeap implements Map<byte[], Map<String, byte[]>>,
Iterable<Map.Entry<byte[], Map<String, byte[]>>>
{
public class BEncodedHeap implements MapStore {
private Heap table;
private final LinkedHashSet<String> columnames;
@ -89,6 +89,42 @@ public class BEncodedHeap implements Map<byte[], Map<String, byte[]>>,
this.columnames = new LinkedHashSet<String>();
}
@Override
public ByteOrder getOrdering() {
return this.table.ordering;
}
/**
 * Iterates over the keys of the underlying table.
 * If the table cannot produce a key iterator, the failure is logged and an
 * iterator without elements is returned instead.
 *
 * @return an iterator over all keys, or an empty iterator after an IOException
 */
@Override
public CloneableIterator<byte[]> keyIterator() {
    try {
        return this.table.keys(true, false);
    } catch (IOException e) {
        Log.logSevere("BEncodedHeap", "returning empty iterator for failed key iteration: " + e.getMessage(), e);
    }

    // fallback: an iterator that never yields an element
    final CloneableIterator<byte[]> nokeys = new CloneableIterator<byte[]>() {

        @Override
        public boolean hasNext() {
            return false;
        }

        @Override
        public byte[] next() {
            return null;
        }

        @Override
        public void remove() {
        }

        @Override
        public CloneableIterator<byte[]> clone(Object modifier) {
            return this;
        }
    };
    return nokeys;
}
public byte[] encodedKey(final String key) {
return Base64Order.enhancedCoder.encodeSubstring(Digest.encodeMD5Raw(key), this.table.keylength);
}
@ -487,7 +523,10 @@ public class BEncodedHeap implements Map<byte[], Map<String, byte[]>>,
* are flushed
*/
public void close() {
    // size and file handle must be read before the table is closed
    final boolean empty = this.size() == 0;
    final File heapFile = this.table.heapFile;
    this.table.close();
    // a heap without entries leaves no file behind
    if (empty) {
        heapFile.delete();
    }
}
/**

@ -0,0 +1,339 @@
/**
* BEncodedHeapBag
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 16.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.kelondro.blob;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.storage.AbstractMapStore;
import net.yacy.cora.storage.MapStore;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.MergeIterator;
import net.yacy.kelondro.util.FileUtils;
public class BEncodedHeapBag extends AbstractMapStore implements MapStore {
private Map<String, BEncodedHeap> bag; // a map from a date string to a kelondroIndex object
private final File baseDir;
private final String prefix;
private final int keylength, buffermax;
private final ByteOrder entryOrder;
private String current;
private final long fileAgeLimit;
private final long fileSizeLimit;
/**
 * Creates a bag of heap files inside a directory and opens every heap file
 * found there whose name matches the prefix.
 *
 * @param path          directory that holds the heap files; created if missing
 * @param prefix        file name prefix; heap files are named prefix + "." + timestamp + ".heap"
 * @param keylength     key length handed to every BEncodedHeap of the bag
 * @param ordering      key order handed to every BEncodedHeap of the bag
 * @param buffermax     buffer limit handed to every BEncodedHeap of the bag
 * @param fileAgeLimit  age in milliseconds after which the current heap file is replaced by a new one
 * @param fileSizeLimit file length in bytes at which the current heap file is replaced by a new one
 */
public BEncodedHeapBag(
        final File path,
        final String prefix,
        final int keylength,
        final ByteOrder ordering,
        final int buffermax,
        final long fileAgeLimit,
        final long fileSizeLimit) {
    // where the files live and how they are named
    this.baseDir = path;
    this.prefix = prefix;
    // parameters for the individual heaps
    this.keylength = keylength;
    this.entryOrder = ordering;
    this.buffermax = buffermax;
    // rotation limits
    this.fileSizeLimit = fileSizeLimit;
    this.fileAgeLimit = fileAgeLimit;
    init();
}
/**
 * (Re-)builds the bag from the files in the base directory.
 * A file belongs to the bag if its name is prefix + "." + 17-character timestamp + ".heap".
 * The file with the latest timestamp becomes the current heap, which is the
 * heap that receives new entries. The files are opened in order of descending file size.
 */
private void init() {
    this.current = null;

    // initialized tables map
    this.bag = new HashMap<String, BEncodedHeap>();
    if (!(this.baseDir.exists())) this.baseDir.mkdirs();
    final String[] tablefile = this.baseDir.list();
    if (tablefile == null) {
        // File.list() returns null if baseDir is not a directory or cannot be read
        Log.logSevere("BEncodedHeapBag", "cannot list heap directory " + this.baseDir);
        return;
    }

    // first pass: find tables
    final HashMap<String, Long> t = new HashMap<String, Long>();
    long ram, time, maxtime = 0;
    Date d;
    File f;
    for (final String element : tablefile) {
        // 23 = '.' + 17 timestamp characters + ".heap"
        if ((element.startsWith(this.prefix)) &&
            (element.length() == this.prefix.length() + 23) &&
            (element.charAt(this.prefix.length()) == '.')) {
            f = new File(this.baseDir, element);
            try {
                d = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(element.substring(this.prefix.length() + 1, this.prefix.length() + 18));
            } catch (final ParseException e) {
                Log.logSevere("BEncodedHeapBag", "", e);
                continue;
            }
            time = d.getTime();
            if (time > maxtime) {
                // the youngest file is the one that is written to
                this.current = element;
                maxtime = time;
            }
            t.put(element, f.length());
        }
    }

    // second pass: open tables
    Iterator<Map.Entry<String, Long>> i;
    Map.Entry<String, Long> entry;
    String maxf;
    long maxram;
    while (!t.isEmpty()) {
        // find maximum table
        maxram = 0;
        maxf = null;
        i = t.entrySet().iterator();
        while (i.hasNext()) {
            entry = i.next();
            ram = entry.getValue().longValue();
            if (maxf == null || ram > maxram) {
                maxf = entry.getKey();
                maxram = ram;
            }
        }

        // open next biggest table
        t.remove(maxf);
        f = new File(this.baseDir, maxf);
        try {
            Log.logInfo("BEncodedHeapBag", "opening partial heap " + f);
            BEncodedHeap heap = new BEncodedHeap(f, this.keylength, this.entryOrder, this.buffermax);
            this.bag.put(maxf, heap);
        } catch (IOException e) {
            Log.logSevere("BEncodedHeapBag", "error opening partial heap " + f, e);
            // a heap that is not in the bag must not stay the write target;
            // with current == null the next put starts a fresh heap file
            if (maxf.equals(this.current)) this.current = null;
        }
    }
}
/**
 * Closes every heap of the bag and drops the bag.
 * Calling close on an already closed bag does nothing.
 */
@Override
public synchronized void close() {
    if (this.bag == null) return;
    for (final BEncodedHeap heap : this.bag.values()) {
        heap.close();
    }
    this.bag = null;
}
/**
 * Removes all entries: closes the bag, deletes every regular file in the base
 * directory whose name starts with the prefix, and initializes an empty bag.
 */
@Override
public void clear() {
    close();
    final String[] l = this.baseDir.list();
    // File.list() returns null if the directory vanished or cannot be read
    if (l != null) {
        for (final String element : l) {
            if (element.startsWith(this.prefix)) {
                final File f = new File(this.baseDir, element);
                if (!f.isDirectory()) FileUtils.deletedelete(f);
            }
        }
    }
    init();
}
/**
 * Finds the heap that stores the given key.
 *
 * @param key the key to look for
 * @return the first heap that contains the key; null if the key is null,
 *         the bag is closed or no heap contains the key
 */
private MapStore keeperOf(final byte[] key) {
    if (key == null || this.bag == null) return null;
    final Iterator<BEncodedHeap> heaps = this.bag.values().iterator();
    while (heaps.hasNext()) {
        final MapStore candidate = heaps.next();
        if (candidate.containsKey(key)) {
            return candidate;
        }
    }
    return null;
}
/**
 * @return a heap file name made of the prefix, the formatter's timestamp and the ".heap" extension
 */
private String newFilename() {
    final String timestamp = GenericFormatter.SHORT_MILSEC_FORMATTER.format();
    return this.prefix + "." + timestamp + ".heap";
}
/**
 * Opens a heap file with a fresh name, adds it to the bag and makes it the
 * current heap.
 *
 * @return the new heap, or null if the heap file could not be opened; in that
 *         case the failure is logged and the previous current heap stays in place
 */
private MapStore newHeap() {
    final String filename = newFilename();
    final File f = new File(this.baseDir, filename);
    final BEncodedHeap heap;
    try {
        heap = new BEncodedHeap(f, this.keylength, this.entryOrder, this.buffermax);
    } catch (IOException e) {
        Log.logSevere("BEncodedHeapBag", "unable to open new heap file: " + e.getMessage(), e);
        return null;
    }
    // switch the write target only after the heap is really available;
    // otherwise 'current' would name a heap that is not part of the bag
    this.current = filename;
    this.bag.put(filename, heap);
    return heap;
}
/**
 * Decides whether the given heap may still be written to.
 * The check runs only while the current second is a multiple of ten; at any
 * other time the given heap is returned unchecked.
 *
 * @param heap the heap that is currently written to
 * @return the given heap, or the result of newHeap() if the timestamp in the
 *         file name is older than the age limit or the file has reached the size limit
 */
private MapStore checkHeap(final BEncodedHeap heap) {
    assert heap != null;
    final long now = System.currentTimeMillis();
    // we check only every 10 seconds because all these file and parser operations are very expensive
    final boolean inCheckWindow = ((now / 1000) % 10) == 0;
    if (!inCheckWindow) return heap;

    final String name = heap.getFile().getName();
    // creation time as encoded in the file name; stays 0 if the name cannot be parsed
    long created = 0;
    try {
        created = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(name.substring(this.prefix.length() + 1, this.prefix.length() + 18)).getTime();
    } catch (final ParseException e) {
        Log.logSevere("BEncodedHeapBag", "", e);
    }

    if (created + this.fileAgeLimit < now) {
        return newHeap();
    }
    if (new File(this.baseDir, name).length() >= this.fileSizeLimit) {
        return newHeap();
    }
    return heap;
}
/**
 * Checks whether any heap of the bag stores the given key.
 *
 * @param key the key; only byte[] keys can be stored
 * @return true if a heap contains the key; false for null or non-byte[] keys
 *         and for a closed bag
 */
@Override
public boolean containsKey(Object key) {
    if (key == null || !(key instanceof byte[])) return false;
    // close() sets the bag to null; synchronizing on null would throw
    final Map<String, BEncodedHeap> heaps = this.bag;
    if (heaps == null) return false;
    synchronized (heaps) {
        return keeperOf((byte[]) key) != null;
    }
}
/**
 * Stores a map under the given key.
 * If a heap already holds the key, the entry is written to that heap.
 * Otherwise it goes to the current heap, which is first checked against the
 * age and size limits; a new heap is started if none is available.
 *
 * @param key the key of the entry
 * @param map the properties to store
 * @return the result of the put on the selected heap; null if the bag is
 *         closed or no heap file could be opened
 */
@Override
public Map<String, byte[]> put(byte[] key, Map<String, byte[]> map) {
    final Map<String, BEncodedHeap> heaps = this.bag;
    if (heaps == null) return null;
    MapStore keeper;
    synchronized (heaps) {
        keeper = keeperOf(key);
        if (keeper == null) {
            // the current heap may be missing: either none was selected yet
            // or the selected file could not be opened
            final BEncodedHeap currentHeap = (this.current == null) ? null : heaps.get(this.current);
            keeper = (currentHeap == null) ? newHeap() : checkHeap(currentHeap);
        }
    }
    // newHeap() returns null (and logs the reason) if no heap file can be opened
    if (keeper == null) return null;
    return keeper.put(key, map);
}
/**
 * Reads the map stored under the given key.
 *
 * @param key the key; only byte[] keys can be stored
 * @return the stored map; null if the key is null, not a byte[], not stored
 *         in any heap, or if the bag is closed
 */
@Override
public Map<String, byte[]> get(Object key) {
    if (key == null || !(key instanceof byte[])) return null;
    final Map<String, BEncodedHeap> heaps = this.bag;
    if (heaps == null) return null;
    synchronized (heaps) {
        // keeperOf returns null for unknown keys; a map lookup must answer
        // that with null instead of failing
        final MapStore keeper = keeperOf((byte[]) key);
        if (keeper == null) return null;
        return keeper.get(key);
    }
}
/**
 * @return true if every heap of the bag is empty
 */
@Override
public boolean isEmpty() {
    for (final BEncodedHeap heap : this.bag.values()) {
        if (!heap.isEmpty()) {
            return false;
        }
    }
    return true;
}
/**
 * @return the sum of the sizes of all heaps of the bag
 */
@Override
public int size() {
    int total = 0;
    for (final BEncodedHeap heap : this.bag.values()) {
        total += heap.size();
    }
    return total;
}
/**
 * Removes the entry with the given key from the heap that stores it.
 *
 * @param key the key; only byte[] keys can be stored
 * @return the result of the remove on the keeping heap, or null if the key
 *         is not a byte[] or no heap contains it
 */
@Override
public Map<String, byte[]> remove(Object key) {
    // instanceof is false for null, so this also rejects null keys
    if (!(key instanceof byte[])) return null;
    MapStore keeper;
    synchronized (this.bag) {
        keeper = keeperOf((byte[]) key);
    }
    return (keeper == null) ? null : keeper.remove(key);
}
/**
 * @return the byte order that was given to the constructor and is handed to every heap of this bag
 */
@Override
public ByteOrder getOrdering() {
return this.entryOrder;
}
/**
 * Merges the key iterators of all heaps into one iterator.
 * Heaps without keys do not take part in the merge.
 *
 * @return the cascaded merge of all non-empty key iterators, ordered by the entry order
 */
@Override
public CloneableIterator<byte[]> keyIterator() {
    final List<CloneableIterator<byte[]>> sources = new ArrayList<CloneableIterator<byte[]>>(this.bag.size());
    for (final BEncodedHeap heap : this.bag.values()) {
        final CloneableIterator<byte[]> keys = heap.keyIterator();
        if (keys == null || !keys.hasNext()) continue;
        sources.add(keys);
    }
    return MergeIterator.cascade(sources, this.entryOrder, MergeIterator.simpleMerge, true);
}
/**
 * Produces a small property map for the self-tests.
 *
 * @param i number that is embedded into both property values
 * @return a map with the properties "rdf:about" and "dc:title"
 */
protected static Map<String, byte[]> testMap(int i) {
    final byte[] about = UTF8.getBytes("http://abc.de/testmap#" + i);
    final byte[] title = UTF8.getBytes("test nr " + i);
    final HashMap<String, byte[]> properties = new HashMap<String, byte[]>();
    properties.put("rdf:about", about);
    properties.put("dc:title", title);
    return properties;
}
/**
 * Opens the bag that is used by the self-test.
 *
 * @param f directory of the test heap files
 * @return a bag with prefix "testbag" and a file size limit of 100 bytes,
 *         so that the test produces many heap files
 */
private static BEncodedHeapBag testHeapBag(File f) {
    final String testPrefix = "testbag";
    final int testKeylength = 12;
    final int testBuffermax = 10;
    final long testSizeLimit = 100 /*Integer.MAX_VALUE*/;
    return new BEncodedHeapBag(
            f,
            testPrefix,
            testKeylength,
            Base64Order.enhancedCoder,
            testBuffermax,
            ArrayStack.oneMonth, testSizeLimit);
}
/**
 * Self-test: writes 10000 entries into a bag in /tmp, reopens the bag, prints
 * up to 1000 entries and removes all entries again.
 *
 * @param args not used
 */
public static void main(String[] args) {
    final File dir = new File("/tmp");

    // fill a bag
    BEncodedHeapBag hb = testHeapBag(dir);
    int n = 0;
    while (n < 10000) {
        hb.put(Word.word2hash(Integer.toString(n)), testMap(n));
        n++;
    }
    System.out.println("test size after put = " + hb.size());
    hb.close();

    // reopen the bag and list the first entries
    hb = testHeapBag(dir);
    final Iterator<Map.Entry<byte[], Map<String, byte[]>>> mi = hb.iterator();
    for (int left = 1000; mi.hasNext() && left > 0; left--) {
        final Map.Entry<byte[], Map<String, byte[]>> entry = mi.next();
        System.out.println(UTF8.String(entry.getKey()) + ": " + AbstractMapStore.map2String(entry.getValue()));
    }

    // remove everything, highest number first
    for (int i = 9999; i >= 0; i--) {
        hb.remove(Word.word2hash(Integer.toString(i)));
    }
    System.out.println("test size after remove = " + hb.size());
    hb.close();
    Log.shutdown();
}
}

@ -0,0 +1,299 @@
/**
* BEncodedHeapShard
* Copyright 2011 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 16.12.2011 at http://yacy.net
*
* $LastChangedDate: 2011-04-14 00:04:23 +0200 (Do, 14 Apr 2011) $
* $LastChangedRevision: 7653 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.kelondro.blob;
import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.storage.AbstractMapStore;
import net.yacy.cora.storage.MapStore;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.MergeIterator;
import net.yacy.kelondro.util.FileUtils;
public class BEncodedHeapShard extends AbstractMapStore implements MapStore {
/**
 * A sharding method decides which part of the shard is responsible for a key
 * and which files in the base directory belong to the shard.
 */
public interface Method {
/**
* a sharding method produces a filename from a given key;
* BEncodedHeapShard uses the result as the name of the shard part and as
* file name prefix of the bag that stores the key
* @param key the access key
* @return the name of the shard part that is responsible for the key
*/
public String filename(byte[] key);
/**
* get the maximum key length for access keys
* @return the key length that is handed to the bags of the shard
*/
public int getKeylength();
/**
* get the byte order on the keys
* @return the order that is handed to the bags of the shard
*/
public ByteOrder getOrdering();
/**
* check if the given file name is a part of the shard
* @param filename name of a file in the base directory of the shard
* @return true if the file is part of the shard
*/
public boolean isShardPart(String filename);
/**
* get the name of the shard part that a file belongs to
* @param filename name of a file for which isShardPart returned true
* @return the name of the shard part
*/
public String getShardName(String filename);
}
/**
 * Sharding method that selects the shard part by the first byte of the key:
 * the name of a shard part is prefix + "." + first key byte.
 */
public static class B64ShardMethod implements Method {

    private final int keylength;
    private final ByteOrder ordering;
    private final byte[] template; // prefix + ".?", the '?' is a placeholder for the first key byte
    private final int charpos;     // position of the placeholder within the template
    private final String prefix;

    /**
     * @param keylength key length of the stored keys
     * @param ordering  byte order of the stored keys
     * @param prefix    common file name prefix of all parts of the shard
     */
    public B64ShardMethod(
            final int keylength,
            final ByteOrder ordering,
            final String prefix) {
        this.keylength = keylength;
        this.ordering = ordering;
        this.template = ASCII.getBytes(prefix + ".?");
        this.charpos = ASCII.getBytes(prefix).length + 1;
        this.prefix = prefix;
    }

    /**
     * @param key the access key; its first byte selects the shard part
     * @return the template with the placeholder replaced by the first key byte
     */
    @Override
    public String filename(byte[] key) {
        byte[] s = new byte[this.template.length];
        System.arraycopy(this.template, 0, s, 0, s.length);
        s[this.charpos] = key[0];
        return ASCII.String(s);
    }

    @Override
    public int getKeylength() {
        return this.keylength;
    }

    @Override
    public ByteOrder getOrdering() {
        return this.ordering;
    }

    /**
     * @param filename name of a file in the base directory
     * @return true if the name starts with the prefix followed by a dot and ends with ".heap"
     */
    @Override
    public boolean isShardPart(String filename) {
        // the length check protects charAt against a name that is exactly the prefix
        return filename.length() > this.prefix.length() &&
               filename.startsWith(this.prefix) &&
               filename.charAt(this.prefix.length()) == '.' &&
               filename.endsWith(".heap");
    }

    /**
     * @param filename name of a file that passed isShardPart
     * @return the leading part of the name up to and including the character after the dot
     */
    @Override
    public String getShardName(String filename) {
        return filename.substring(0, this.template.length);
    }
}
private ConcurrentHashMap<String, MapStore> shard;
private final File baseDir;
private final Method shardMethod;
/**
 * Opens a shard in the given directory; all files that the sharding method
 * recognizes as shard parts are opened.
 *
 * @param baseDir     directory that holds the files of the shard; created if missing
 * @param shardMethod decides which shard part is responsible for a key
 */
public BEncodedHeapShard(File baseDir, Method shardMethod) {
    this.shardMethod = shardMethod;
    this.baseDir = baseDir;
    this.shard = new ConcurrentHashMap<String, MapStore>();
    init();
}
/**
 * (Re-)builds the shard map: for every file in the base directory that is
 * part of the shard, the bag of the corresponding shard part is opened.
 */
private void init() {
    // initialized tables map
    this.shard = new ConcurrentHashMap<String, MapStore>();
    if (!(this.baseDir.exists())) this.baseDir.mkdirs();
    final String[] tablefile = this.baseDir.list();
    if (tablefile == null) {
        // File.list() returns null if baseDir is not a directory or cannot be read
        Log.logSevere("BEncodedHeapShard", "cannot list shard directory " + this.baseDir);
        return;
    }

    // open all tables of this shard
    for (final String element : tablefile) {
        if (!this.shardMethod.isShardPart(element)) continue;
        final String shardname = this.shardMethod.getShardName(element);
        // a bag can consist of several heap files; it is opened once and
        // collects all of its files by itself
        if (this.shard.containsKey(shardname)) continue;
        Log.logInfo("BEncodedHeapShard", "opening partial shard " + element);
        // the bag must get the shard name as file prefix (the same value that
        // keeperOf uses); with the complete file name as prefix the bag does
        // not find any of its existing heap files
        this.shard.put(shardname, openBag(shardname));
    }
}
/**
 * Closes every bag of the shard and drops the shard map.
 * Calling close on an already closed shard does nothing.
 */
@Override
public void close() {
    if (this.shard == null) return;
    for (final MapStore bag : this.shard.values()) {
        bag.close();
    }
    this.shard = null;
}
/**
 * Removes all entries: closes the shard, deletes every regular file in the
 * base directory that is part of the shard, and initializes an empty shard.
 */
@Override
public void clear() {
    close();
    final String[] l = this.baseDir.list();
    // File.list() returns null if the directory vanished or cannot be read
    if (l != null) {
        for (final String element : l) {
            if (this.shardMethod.isShardPart(element)) {
                final File f = new File(this.baseDir, element);
                if (!f.isDirectory()) FileUtils.deletedelete(f);
            }
        }
    }
    init();
}
/**
 * Finds the bag that is responsible for the given key; the bag is opened
 * and registered if it is not known yet.
 *
 * @param key the access key
 * @return the bag of the shard part that the sharding method selects for the key
 */
private MapStore keeperOf(final byte[] key) {
    final String shardname = this.shardMethod.filename(key);
    final MapStore known = this.shard.get(shardname);
    if (known != null) {
        return known;
    }
    final MapStore opened = openBag(shardname);
    this.shard.put(shardname, opened);
    return opened;
}
/**
 * Opens the bag of one shard part in the base directory.
 *
 * @param shardfile file name prefix of the heap files of the bag
 * @return a bag with an age limit of twelve times ArrayStack.oneMonth and a
 *         size limit of Integer.MAX_VALUE bytes
 */
public MapStore openBag(String shardfile) {
    final int keylength = this.shardMethod.getKeylength();
    final ByteOrder ordering = this.shardMethod.getOrdering();
    return new BEncodedHeapBag(
            this.baseDir,
            shardfile,
            keylength,
            ordering,
            10,
            ArrayStack.oneMonth * 12,
            Integer.MAX_VALUE);
}
/**
 * Checks whether the responsible bag stores the given key.
 * A bag that is not open yet is not opened by this lookup.
 *
 * @param key the key; only byte[] keys can be stored
 * @return true if the bag of the responsible shard part is open and contains the key
 */
@Override
public boolean containsKey(Object key) {
    // instanceof is false for null, so this also rejects null keys
    if (!(key instanceof byte[])) return false;
    final MapStore bag = this.shard.get(this.shardMethod.filename((byte[]) key));
    return bag != null && bag.containsKey(key);
}
/**
 * Stores a map under the given key in the bag of the responsible shard part;
 * the bag is opened if necessary.
 *
 * @param key the key of the entry
 * @param map the properties to store
 * @return the result of the put on the bag, or null if the shard is closed
 */
@Override
public Map<String, byte[]> put(byte[] key, Map<String, byte[]> map) {
    if (this.shard == null) {
        return null;
    }
    final MapStore target;
    synchronized (this.shard) {
        target = keeperOf(key);
    }
    return target.put(key, map);
}
/**
 * Reads the map stored under the given key.
 * A bag that is not open yet is not opened by this lookup.
 *
 * @param key the key; only byte[] keys can be stored
 * @return the result of the lookup in the responsible bag; null if the key
 *         is not a byte[] or the bag is not open
 */
@Override
public Map<String, byte[]> get(Object key) {
    // instanceof is false for null, so this also rejects null keys
    if (!(key instanceof byte[])) return null;
    final MapStore bag = this.shard.get(this.shardMethod.filename((byte[]) key));
    return (bag == null) ? null : bag.get(key);
}
/**
 * @return true if every open bag of the shard is empty
 */
@Override
public boolean isEmpty() {
    for (final MapStore bag : this.shard.values()) {
        if (!bag.isEmpty()) {
            return false;
        }
    }
    return true;
}
/**
 * @return the sum of the sizes of all open bags of the shard
 */
@Override
public int size() {
    int total = 0;
    for (final MapStore bag : this.shard.values()) {
        total += bag.size();
    }
    return total;
}
/**
 * Removes the entry with the given key from the bag of the responsible shard part.
 * A bag that is not open cannot hold the key, so the lookup does not open
 * or register a bag, just like get and containsKey.
 *
 * @param key the key; only byte[] keys can be stored
 * @return the result of the remove on the responsible bag; null if the key
 *         is null, not a byte[], the bag is not open or the shard is closed
 */
@Override
public Map<String, byte[]> remove(Object key) {
    if (key == null || !(key instanceof byte[])) return null;
    // close() sets the shard map to null
    final ConcurrentHashMap<String, MapStore> shards = this.shard;
    if (shards == null) return null;
    final MapStore bag = shards.get(this.shardMethod.filename((byte[]) key));
    if (bag == null) return null;
    return bag.remove(key);
}
/**
 * @return the byte order on the keys as defined by the sharding method
 */
@Override
public ByteOrder getOrdering() {
return this.shardMethod.getOrdering();
}
/**
 * Merges the key iterators of all open bags into one iterator.
 * Bags without keys do not take part in the merge.
 *
 * @return the cascaded merge of all non-empty key iterators, ordered by the order of the sharding method
 */
@Override
public CloneableIterator<byte[]> keyIterator() {
    final List<CloneableIterator<byte[]>> sources = new ArrayList<CloneableIterator<byte[]>>(this.shard.size());
    for (final MapStore bag : this.shard.values()) {
        final CloneableIterator<byte[]> keys = bag.keyIterator();
        if (keys == null || !keys.hasNext()) continue;
        sources.add(keys);
    }
    return MergeIterator.cascade(sources, this.shardMethod.getOrdering(), MergeIterator.simpleMerge, true);
}
/**
 * Opens the shard that is used by the self-test.
 *
 * @param f directory of the test files
 * @return a shard with the file name prefix "testshard" and a key length of 12
 */
private static BEncodedHeapShard testHeapShard(File f) {
    final Method method = new B64ShardMethod(12, Base64Order.enhancedCoder, "testshard");
    return new BEncodedHeapShard(f, method);
}
/**
 * Self-test: writes 10000 entries into a shard in /tmp, reopens the shard,
 * prints up to 100 entries and removes all entries again.
 *
 * @param args not used
 */
public static void main(String[] args) {
    final File dir = new File("/tmp");

    // fill a shard
    BEncodedHeapShard hb = testHeapShard(dir);
    int n = 0;
    while (n < 10000) {
        hb.put(Word.word2hash(Integer.toString(n)), BEncodedHeapBag.testMap(n));
        n++;
    }
    System.out.println("test size after put = " + hb.size());
    hb.close();

    // reopen the shard and list the first entries
    hb = testHeapShard(dir);
    final Iterator<Map.Entry<byte[], Map<String, byte[]>>> mi = hb.iterator();
    for (int left = 100; mi.hasNext() && left > 0; left--) {
        final Map.Entry<byte[], Map<String, byte[]>> entry = mi.next();
        System.out.println(UTF8.String(entry.getKey()) + ": " + AbstractMapStore.map2String(entry.getValue()));
    }

    // remove everything, highest number first
    for (int i = 9999; i >= 0; i--) {
        hb.remove(Word.word2hash(Integer.toString(i)));
    }
    System.out.println("test size after remove = " + hb.size());
    hb.close();
    Log.shutdown();
}
}

@ -28,9 +28,9 @@ package net.yacy.kelondro.blob;
import java.io.IOException;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.CloneableIterator;
public interface BLOB {

@ -37,10 +37,10 @@ import java.util.TreeMap;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.kelondro.util.MemoryControl;

@ -35,11 +35,11 @@ import java.util.SortedMap;
import java.util.TreeMap;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.io.AbstractWriter;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.MemoryControl;
@ -125,12 +125,12 @@ public final class Heap extends HeapModifier implements BLOB {
*/
@Override
public boolean containsKey(byte[] key) {
assert index != null;
assert this.index != null;
key = normalizeKey(key);
synchronized (this) {
// check the buffer
assert buffer != null;
if (buffer != null) {
assert this.buffer != null;
if (this.buffer != null) {
if (this.buffer.containsKey(key)) return true;
}
return super.containsKey(key);
@ -147,13 +147,13 @@ public final class Heap extends HeapModifier implements BLOB {
private void add(byte[] key, final byte[] blob) throws IOException {
assert blob.length > 0;
if ((blob == null) || (blob.length == 0)) return;
final int pos = (int) file.length();
final int pos = (int) this.file.length();
try {
index.put(key, pos);
file.seek(pos);
file.writeInt(this.keylength + blob.length);
file.write(key);
file.write(blob, 0, blob.length);
this.index.put(key, pos);
this.file.seek(pos);
this.file.writeInt(this.keylength + blob.length);
this.file.write(key);
this.file.write(blob, 0, blob.length);
} catch (RowSpaceExceededException e) {
throw new IOException(e.getMessage()); // should never occur;
}
@ -166,8 +166,8 @@ public final class Heap extends HeapModifier implements BLOB {
* @throws RowSpaceExceededException
*/
public void flushBuffer() throws IOException {
assert buffer != null;
if (buffer == null) return;
assert this.buffer != null;
if (this.buffer == null) return;
// check size of buffer
Iterator<Map.Entry<byte[], byte[]>> i = this.buffer.entrySet().iterator();
@ -196,18 +196,18 @@ public final class Heap extends HeapModifier implements BLOB {
// append all contents of the buffer into one byte[]
i = this.buffer.entrySet().iterator();
final long pos = file.length();
final long pos = this.file.length();
long posFile = pos;
posBuffer = 0;
byte[] ba = new byte[l + (4 + this.keylength) * this.buffer.size()];
byte[] b;
SortedMap<byte[], byte[]> nextBuffer = new TreeMap<byte[], byte[]>(ordering);
SortedMap<byte[], byte[]> nextBuffer = new TreeMap<byte[], byte[]>(this.ordering);
flush: while (i.hasNext()) {
entry = i.next();
key = normalizeKey(entry.getKey());
blob = entry.getValue();
try {
index.put(key, posFile);
this.index.put(key, posFile);
} catch (RowSpaceExceededException e) {
nextBuffer.put(entry.getKey(), blob);
continue flush;
@ -245,8 +245,8 @@ public final class Heap extends HeapModifier implements BLOB {
synchronized (this) {
// check the buffer
assert buffer != null;
if (buffer != null) {
assert this.buffer != null;
if (this.buffer != null) {
byte[] blob = this.buffer.get(key);
if (blob != null) return blob;
}
@ -267,8 +267,8 @@ public final class Heap extends HeapModifier implements BLOB {
synchronized (this) {
// check the buffer
assert buffer != null;
if (buffer != null) {
assert this.buffer != null;
if (this.buffer != null) {
byte[] blob = this.buffer.get(key);
if (blob != null) return blob.length;
}
@ -284,8 +284,8 @@ public final class Heap extends HeapModifier implements BLOB {
@Override
public synchronized void clear() throws IOException {
Log.logInfo("Heap", "clearing heap " + this.name());
assert buffer != null;
if (buffer == null) buffer = new TreeMap<byte[], byte[]>(ordering);
assert this.buffer != null;
if (this.buffer == null) this.buffer = new TreeMap<byte[], byte[]>(this.ordering);
this.buffer.clear();
this.buffersize = 0;
super.clear();
@ -297,7 +297,7 @@ public final class Heap extends HeapModifier implements BLOB {
@Override
public synchronized void close(final boolean writeIDX) {
Log.logInfo("Heap", "closing heap " + this.name());
if (file != null && buffer != null) {
if (this.file != null && this.buffer != null) {
try {
flushBuffer();
} catch (IOException e) {
@ -306,7 +306,7 @@ public final class Heap extends HeapModifier implements BLOB {
}
this.buffer = null;
super.close(writeIDX);
assert file == null;
assert this.file == null;
}
@Override
@ -351,11 +351,11 @@ public final class Heap extends HeapModifier implements BLOB {
assert this.buffer != null;
// if there is not enough space in the buffer, flush all
if (this.buffersize + b.length > buffermax || MemoryControl.shortStatus()) {
if (this.buffersize + b.length > this.buffermax || MemoryControl.shortStatus()) {
// this is too big. Flush everything
super.shrinkWithGapsAtEnd();
flushBuffer();
if (b.length > buffermax) {
if (b.length > this.buffermax) {
this.add(key, b);
} else {
if (this.buffer != null) {
@ -387,6 +387,7 @@ public final class Heap extends HeapModifier implements BLOB {
final int reclen = b.length + this.keylength;
Map.Entry<Long, Integer> entry;
Iterator<Map.Entry<Long, Integer>> i = this.free.entrySet().iterator();
int acount = 0, bcount = 0;
while (i.hasNext()) {
entry = i.next();
if (entry.getValue().intValue() == reclen) {
@ -397,14 +398,14 @@ public final class Heap extends HeapModifier implements BLOB {
this.index.put(key, entry.getKey());
// write to file
file.seek(entry.getKey().longValue());
final int reclenf = file.readInt();
this.file.seek(entry.getKey().longValue());
final int reclenf = this.file.readInt();
assert reclenf == reclen;
file.write(key);
this.file.write(key);
if (this.keylength > key.length) {
for (int j = 0; j < this.keylength - key.length; j++) file.write(HeapWriter.ZERO);
for (int j = 0; j < this.keylength - key.length; j++) this.file.write(HeapWriter.ZERO);
}
file.write(b);
this.file.write(b);
// remove the entry from the free list
i.remove();
@ -414,10 +415,13 @@ public final class Heap extends HeapModifier implements BLOB {
// finished!
return true;
}
acount++;
// look for the biggest size
if (entry.getValue() > lsize) {
if (entry.getValue().intValue() > lsize) {
lseek = entry.getKey();
lsize = entry.getValue();
bcount++;
if (acount > 100 || bcount > 10) break; // in case that we have really a lot break here
}
}
@ -429,21 +433,21 @@ public final class Heap extends HeapModifier implements BLOB {
// data structure in the file
// write the new entry
file.seek(lseek);
file.writeInt(reclen);
file.write(key);
this.file.seek(lseek);
this.file.writeInt(reclen);
this.file.write(key);
if (this.keylength > key.length) {
for (int j = 0; j < this.keylength - key.length; j++) file.write(HeapWriter.ZERO);
for (int j = 0; j < this.keylength - key.length; j++) this.file.write(HeapWriter.ZERO);
}
file.write(b);
this.file.write(b);
// add the index to the new entry
index.put(key, lseek);
this.index.put(key, lseek);
// define the new empty entry
final int newfreereclen = lsize - reclen - 4;
assert newfreereclen > 0;
file.writeInt(newfreereclen);
this.file.writeInt(newfreereclen);
// remove the old free entry
this.free.remove(lseek);
@ -473,8 +477,8 @@ public final class Heap extends HeapModifier implements BLOB {
super.deleteFingerprint();
// check the buffer
assert buffer != null;
if (buffer != null) {
assert this.buffer != null;
if (this.buffer != null) {
byte[] blob = this.buffer.remove(key);
if (blob != null) {
this.buffersize -= blob.length;

@ -29,10 +29,10 @@ import java.io.IOException;
import java.util.SortedMap;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.io.CachedFileWriter;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;

@ -37,13 +37,13 @@ import java.util.concurrent.ExecutionException;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.index.HandleMap;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.io.CachedFileWriter;
import net.yacy.kelondro.io.Writer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.order.RotateIterator;

@ -31,10 +31,10 @@ import java.io.FileOutputStream;
import java.io.IOException;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
import net.yacy.kelondro.index.HandleMap;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.util.FileUtils;

@ -35,16 +35,15 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.ranking.ClusteredScoreMap;
import net.yacy.cora.ranking.ConcurrentScoreMap;
import net.yacy.cora.ranking.ScoreMap;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.util.LookAheadIterator;
public class MapDataMining extends MapHeap {
@ -105,7 +104,7 @@ public class MapDataMining extends MapHeap {
}
// fill cluster and accumulator with values
if ((sortfields != null) || (longaccfields != null) || (floataccfields != null)) try {
if (sortfields != null || longaccfields != null || floataccfields != null) try {
final CloneableIterator<byte[]> it = super.keys(true, false);
byte[] mapnameb;
String cell;
@ -122,37 +121,49 @@ public class MapDataMining extends MapHeap {
}
if (map == null) break;
if (sortfields != null && cluster != null) for (int i = 0; i < sortfields.length; i++) {
cell = map.get(sortfields[i]);
if (cell != null) cluster[i].set(UTF8.String(mapnameb), ClusteredScoreMap.object2score(cell));
if (sortfields != null && cluster != null) {
for (int i = 0; i < sortfields.length; i++) {
cell = map.get(sortfields[i]);
if (cell != null) cluster[i].set(UTF8.String(mapnameb), ClusteredScoreMap.object2score(cell));
}
}
if (longaccfields != null && longaccumulator != null) for (int i = 0; i < longaccfields.length; i++) {
cell = map.get(longaccfields[i]);
valuel = 0;
if (cell != null) try {
valuel = Long.parseLong(cell);
longaccumulator[i] = Long.valueOf(longaccumulator[i].longValue() + valuel);
} catch (final NumberFormatException e) {}
if (longaccfields != null && longaccumulator != null) {
for (int i = 0; i < longaccfields.length; i++) {
cell = map.get(longaccfields[i]);
valuel = 0;
if (cell != null) try {
valuel = Long.parseLong(cell);
longaccumulator[i] = Long.valueOf(longaccumulator[i].longValue() + valuel);
} catch (final NumberFormatException e) {}
}
}
if (floataccfields != null && floataccumulator != null) for (int i = 0; i < floataccfields.length; i++) {
cell = map.get(floataccfields[i]);
valued = 0f;
if (cell != null) try {
valued = Float.parseFloat(cell);
floataccumulator[i] = new Float(floataccumulator[i].floatValue() + valued);
} catch (final NumberFormatException e) {}
if (floataccfields != null && floataccumulator != null) {
for (int i = 0; i < floataccfields.length; i++) {
cell = map.get(floataccfields[i]);
valued = 0f;
if (cell != null) try {
valued = Float.parseFloat(cell);
floataccumulator[i] = new Float(floataccumulator[i].floatValue() + valued);
} catch (final NumberFormatException e) {}
}
}
}
} catch (final IOException e) {}
// fill cluster
if (sortfields != null && cluster != null) for (int i = 0; i < sortfields.length; i++) this.sortClusterMap.put(sortfields[i], cluster[i]);
if (sortfields != null && cluster != null) {
for (int i = 0; i < sortfields.length; i++) this.sortClusterMap.put(sortfields[i], cluster[i]);
}
// fill acc map
if (longaccfields != null && longaccumulator != null) for (int i = 0; i < longaccfields.length; i++) this.accLong.put(longaccfields[i], longaccumulator[i]);
if (floataccfields != null && floataccumulator != null) for (int i = 0; i < floataccfields.length; i++) this.accFloat.put(floataccfields[i], floataccumulator[i]);
if (longaccfields != null && longaccumulator != null) {
for (int i = 0; i < longaccfields.length; i++) this.accLong.put(longaccfields[i], longaccumulator[i]);
}
if (floataccfields != null && floataccumulator != null) {
for (int i = 0; i < floataccfields.length; i++) this.accFloat.put(floataccfields[i], floataccumulator[i]);
}
}
@Override
@ -213,7 +224,7 @@ public class MapDataMining extends MapHeap {
float valued;
Long longaccumulator;
Float floataccumulator;
if (this.longaccfields != null)
if (this.longaccfields != null) {
for (final String longaccfield : this.longaccfields) {
value = map.get(longaccfield);
if (value != null) {
@ -228,7 +239,8 @@ public class MapDataMining extends MapHeap {
} catch (final NumberFormatException e) {}
}
}
if (this.floataccfields != null)
}
if (this.floataccfields != null) {
for (final String floataccfield : this.floataccfields) {
value = map.get(floataccfield);
if (value != null) {
@ -243,6 +255,7 @@ public class MapDataMining extends MapHeap {
} catch (final NumberFormatException e) {}
}
}
}
}
private void updateSortCluster(final String key, final Map<String, String> map) {
@ -283,23 +296,6 @@ public class MapDataMining extends MapHeap {
super.delete(key);
}
/* would be better but does not work (recursion)
@Override
public synchronized void delete(final byte[] key) throws IOException {
if (key == null) return;
// update elementCount
Map<String, String> map = super.remove(key);
if (map != null && (sortfields != null || longaccfields != null || floataccfields != null)) {
// update accumulators (subtract)
if ((longaccfields != null) || (floataccfields != null)) updateAcc(map, false);
// remove from sortCluster
if (sortfields != null) deleteSortCluster(UTF8.String(key));
}
}
*/
private void deleteSortCluster(final String key) {
if (key == null) return;
ScoreMap<String> cluster;
@ -327,34 +323,34 @@ public class MapDataMining extends MapHeap {
this.s = s;
}
@Override
public boolean hasNext() {
return this.s.hasNext();
}
/**
 * Fetches the next key from the wrapped String iterator and converts it to bytes.
 *
 * @return the UTF-8 encoded bytes of the next key, or null if the wrapped iterator delivered null
 */
@Override
public byte[] next() {
    final String nextKey = this.s.next();
    return (nextKey == null) ? null : UTF8.getBytes(nextKey);
}
@Override
public void remove() {
this.s.remove();
}
}
public synchronized mapIterator maps(final boolean up, final String field) {
return new mapIterator(keys(up, field));
@Override
public synchronized Iterator<Map.Entry<byte[], Map<String, String>>> entries(final String whereKey, final String isValue) throws IOException {
return super.entries(whereKey, isValue);
}
public synchronized mapIterator maps(final boolean up, final boolean rotating) throws IOException {
return new mapIterator(keys(up, rotating));
public synchronized Iterator<Map.Entry<byte[], Map<String, String>>> entries(final boolean up, final String field) {
return new MapIterator(keys(up, field), null, null);
}
public synchronized mapIterator maps(final boolean up, final boolean rotating, final byte[] firstKey, final byte[] secondKey) throws IOException {
return new mapIterator(keys(up, rotating, firstKey, secondKey));
}
public synchronized long getLongAcc(final String field) {
final Long accumulator = this.accLong.get(field);
if (accumulator == null) return -1;
@ -389,50 +385,33 @@ public class MapDataMining extends MapHeap {
super.close();
}
public class mapIterator extends LookAheadIterator<Map<String, String>> implements Iterator<Map<String, String>> {
// enumerates Map-Type elements
// the key is also included in every map that is returned; it's key is 'key'
/*
public byte[] lookupBy(
final String whereKey,
final String isValue
) {
private final Iterator<byte[]> keyIterator;
private mapIterator(final Iterator<byte[]> keyIterator) {
this.keyIterator = keyIterator;
}
public Map<String, String> next0() {
if (this.keyIterator == null) return null;
byte[] nextKey;
Map<String, String> map;
while (this.keyIterator.hasNext()) {
nextKey = this.keyIterator.next();
try {
map = get(nextKey, false);
} catch (final IOException e) {
Log.logWarning("MapDataMining", e.getMessage());
continue;
} catch (final RowSpaceExceededException e) {
Log.logException(e);
continue;
}
if (map == null) continue; // circumvention of a modified exception
map.put("key", UTF8.String(nextKey));
return map;
}
return null;
}
} // class mapIterator
}
*/
public static void main(final String[] args) {
try {
final MapDataMining db = new MapDataMining(new File("/tmp/MapDataMinig.test.db"), Word.commonHashLength, Base64Order.enhancedCoder, 1024 * 512, 500, new String[] {"X"}, new String[] {"X"}, new String[] {}, null);
final Map<String, String> m1 = new HashMap<String, String>(); m1.put("X", Long.toString(System.currentTimeMillis()));
File f = new File("/tmp/MapDataMinig.test.db");
f.delete();
final MapDataMining db = new MapDataMining(f, Word.commonHashLength, Base64Order.enhancedCoder, 1024 * 512, 500, new String[] {"X"}, new String[] {"X"}, new String[] {}, null);
final Map<String, String> m1 = new HashMap<String, String>();
long t = System.currentTimeMillis();
m1.put("X", Long.toString(t));
db.put("abcdefghijk1".getBytes(), m1);
final Map<String, String> m2 = new HashMap<String, String>(); m2.put("X", Long.toString(System.currentTimeMillis() - 1000));
final Map<String, String> m2 = new HashMap<String, String>();
m2.put("X", Long.toString(t - 1000));
db.put("abcdefghijk2".getBytes(), m2);
final Map<String, String> m3 = new HashMap<String, String>(); m3.put("X", Long.toString(System.currentTimeMillis() + 2000));
final Map<String, String> m3 = new HashMap<String, String>();
m3.put("X", Long.toString(t + 2000));
db.put("abcdefghijk3".getBytes(), m3);
// iterate the keys, sorted by field X in ascending order (must be: abcdefghijk2 - abcdefghijk1 - abcdefghijk3)
final Iterator<byte[]> i1 = db.keys(true, "X");
byte[] k;
while (i1.hasNext()) {
@ -440,11 +419,12 @@ public class MapDataMining extends MapHeap {
System.out.println(new String(k));
}
final Iterator<Map<String, String>> i2 = db.maps(false, "X");
Map<String, String> e;
// iterate the maps, sorted by field X in descending order (must be: abcdefghijk3 - abcdefghijk1 - abcdefghijk2)
final Iterator<Map.Entry<byte[], Map<String, String>>> i2 = db.entries(false, "X");
Map.Entry<byte[], Map<String, String>> e;
while (i2.hasNext()) {
e = i2.next();
System.out.println(e);
System.out.println(UTF8.String(e.getKey()) + ":" + e.getValue());
}
System.exit(0);

@ -32,6 +32,7 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.AbstractMap;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
@ -42,17 +43,17 @@ import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.order.RotateIterator;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.LookAheadIterator;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.kelondroException;
public class MapHeap implements Map<byte[], Map<String, String>> {
@ -85,6 +86,7 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
* clears the content of the database
* @throws IOException
*/
@Override
public synchronized void clear() {
try {
this.blob.clear();
@ -168,6 +170,7 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
}
}
@Override
public Map<String, String> put(final byte[] key, final Map<String, String> newMap) {
Map<String, String> v = null;
try {
@ -200,6 +203,7 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
}
}
@Override
public Map<String, String> remove(final Object key) {
Map<String, String> v = null;
try {
@ -218,6 +222,7 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
* @throws IOException
*/
@Override
public boolean containsKey(final Object k) {
if (!(k instanceof byte[])) return false;
assert k != null;
@ -241,6 +246,7 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
return get(key, true);
}
@Override
public Map<String, String> get(final Object key) {
if (key == null) return null;
try {
@ -375,6 +381,7 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
this.iterator = (rotating) ? new RotateIterator<byte[]>(i, secondKey, MapHeap.this.blob.size()) : i;
}
@Override
public byte[] next() {
return removeFillchar(this.iterator.next());
}
@ -399,23 +406,29 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
}
}
public synchronized Iterator<Map.Entry<byte[], Map<String, String>>> entries(final String whereKey, final String isValue) throws IOException {
return new MapIterator(this.blob.keys(true, null), whereKey, isValue);
}
public synchronized MapIterator entries(final boolean up, final boolean rotating) throws IOException {
return new MapIterator(keys(up, rotating));
public synchronized Iterator<Map.Entry<byte[], Map<String, String>>> entries(final boolean up, final boolean rotating) throws IOException {
return new MapIterator(keys(up, rotating), null, null);
}
public synchronized MapIterator entries(final boolean up, final boolean rotating, final byte[] firstKey, final byte[] secondKey) throws IOException {
return new MapIterator(keys(up, rotating, firstKey, secondKey));
public synchronized Iterator<Map.Entry<byte[], Map<String, String>>> entries(final boolean up, final boolean rotating, final byte[] firstKey, final byte[] secondKey) throws IOException {
return new MapIterator(keys(up, rotating, firstKey, secondKey), null, null);
}
/**
 * Reports how many entries the table holds.
 *
 * @return the size reported by the underlying blob, or 0 if there is no blob
 */
@Override
public synchronized int size() {
    if (this.blob == null) {
        return 0;
    }
    return this.blob.size();
}
/**
 * Tells whether the table holds no entries.
 *
 * @return true if there is no blob or the blob reports itself as empty
 */
@Override
public synchronized boolean isEmpty() {
    return this.blob == null || this.blob.isEmpty();
}
@ -435,48 +448,52 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
public void finalize() {
close();
}
public class MapIterator implements Iterator<Map<String, String>> {
public class MapIterator extends LookAheadIterator<Map.Entry<byte[], Map<String, String>>> implements Iterator<Map.Entry<byte[], Map<String, String>>> {
// enumerates Map-Type elements
// the key is also included in every map that is returned; it's key is 'key'
Iterator<byte[]> keyIterator;
boolean finish;
private final Iterator<byte[]> keyIterator;
private final String whereKey, isValue;
public MapIterator(final Iterator<byte[]> keyIterator) {
MapIterator(final Iterator<byte[]> keyIterator, final String whereKey, final String isValue) {
this.keyIterator = keyIterator;
this.finish = false;
}
public boolean hasNext() {
return (!(this.finish)) && (this.keyIterator.hasNext());
this.whereKey = whereKey;
this.isValue = isValue;
}
public Map<String, String> next() {
byte[] nextKey = this.keyIterator.next();
if (nextKey == null) {
this.finish = true;
return null;
}
nextKey = normalizeKey(nextKey); // the key must be normalized because the keyIterator may iterate over not-normalized keys
try {
final Map<String, String> obj = get(nextKey, false);
if (obj == null) throw new kelondroException("no more elements available");
return obj;
} catch (final IOException e) {
this.finish = true;
return null;
} catch (final RowSpaceExceededException e) {
this.finish = true;
return null;
@Override
public Map.Entry<byte[], Map<String, String>> next0() {
if (this.keyIterator == null) return null;
byte[] nextKey;
Map<String, String> map;
while (this.keyIterator.hasNext()) {
nextKey = this.keyIterator.next();
try {
map = get(nextKey, false);
} catch (final IOException e) {
Log.logWarning("MapDataMining", e.getMessage());
continue;
} catch (final RowSpaceExceededException e) {
Log.logException(e);
continue;
}
if (map == null) continue; // circumvention of a modified exception
// check if the where case holds
if (this.whereKey != null && this.isValue != null) {
String v = map.get(this.whereKey);
if (v == null) continue;
if (!v.equals(this.isValue)) continue;
}
// produce entry
Map.Entry<byte[], Map<String, String>> entry = new AbstractMap.SimpleImmutableEntry<byte[], Map<String, String>>(nextKey, map);
return entry;
}
}
public void remove() {
throw new UnsupportedOperationException();
return null;
}
} // class mapIterator
@Override
public void putAll(final Map<? extends byte[], ? extends Map<String, String>> map) {
for (final Map.Entry<? extends byte[], ? extends Map<String, String>> me: map.entrySet()) {
try {
@ -489,6 +506,7 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
}
}
@Override
public Set<byte[]> keySet() {
final TreeSet<byte[]> set = new TreeSet<byte[]>(this.blob.ordering());
try {
@ -498,16 +516,19 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
return set;
}
@Override
public Collection<Map<String, String>> values() {
// this method shall not be used because it is not appropriate for this kind of data
throw new UnsupportedOperationException();
}
@Override
public Set<java.util.Map.Entry<byte[], Map<String, String>>> entrySet() {
// this method shall not be used because it is not appropriate for this kind of data
throw new UnsupportedOperationException();
}
@Override
public boolean containsValue(final Object value) {
// this method shall not be used because it is not appropriate for this kind of data
throw new UnsupportedOperationException();

@ -30,6 +30,8 @@ package net.yacy.kelondro.data.meta;
import java.io.File;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
@ -71,6 +73,37 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
}
return (url == null) ? null : ASCII.String(url.hash(), 6, 6);
}
/**
 * Converts a comma-separated list of host names into a concatenation of host hashes.
 * Every list element is trimmed; empty elements are skipped, and so are hosts for which
 * {@code hosthash(String)} returns null or a hash that is not exactly 6 characters long.
 *
 * @param hostlist comma-separated host names; may be null
 * @return the 6-character host hashes appended to each other without any separator;
 *         the empty string if hostlist is null or contains no usable host
 */
public static String hosthashes(final String hostlist) {
    // a missing list is treated like an empty one instead of failing with a NullPointerException
    if (hostlist == null) return "";
    final StringBuilder sb = new StringBuilder(hostlist.length());
    for (final String entry : hostlist.split(",")) {
        final String host = entry.trim();
        if (host.length() == 0) continue;
        final String hash = hosthash(host);
        // only well-formed hashes may be appended, otherwise the result cannot be cut into 6-character pieces again
        if (hash == null || hash.length() != 6) continue;
        sb.append(hash);
    }
    return sb.toString();
}
/**
 * Cuts a concatenation of 6-character host hashes (no separator between them)
 * into the set of individual hashes.
 * A trailing fragment shorter than 6 characters (malformed input, e.g. from a
 * hand-edited configuration value) is ignored instead of causing a
 * StringIndexOutOfBoundsException.
 *
 * @param hosthashes concatenated host hashes; may be null
 * @return the set of 6-character hashes, or null if hosthashes is null or empty
 */
public static Set<String> hosthashess(String hosthashes) {
    if (hosthashes == null || hosthashes.length() == 0) return null;
    final Set<String> hashes = new HashSet<String>();
    // only complete 6-character chunks are taken; i + 6 never exceeds the string length
    for (int i = 0; i + 6 <= hosthashes.length(); i += 6) {
        hashes.add(hosthashes.substring(i, i + 6));
    }
    return hashes;
}
/**
* DigestURI from File
@ -180,8 +213,10 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
int p = (this.host == null) ? -1 : this.host.lastIndexOf('.');
String dom = (p > 0) ? dom = this.host.substring(0, p) : "";
p = dom.lastIndexOf('.'); // locate subdomain
String subdom = "";
if (p > 0) {
final String subdom;
if (p <= 0) {
subdom = "";
} else {
subdom = dom.substring(0, p);
dom = dom.substring(p + 1);
}

@ -189,12 +189,15 @@ public class URIMetadataRow implements URIMetadata {
final float lat,
final float lon) {
final CharBuffer s = new CharBuffer(360);
s.append(url.toNormalform(false, true)).append(10);
s.append(dc_title).append(10);
s.append(dc_creator.length() > 80 ? dc_creator.substring(0, 80) : dc_creator).append(10);
s.append(dc_subject.length() > 120 ? dc_subject.substring(0, 120) : dc_subject).append(10);
s.append(dc_publisher).append(10);
if (lon == 0.0f && lat == 0.0f) s.append(10); else s.append(Float.toString(lat)).append(',').append(Float.toString(lon)).append(10);
s.append(url.toNormalform(false, true)).appendLF();
s.append(dc_title).appendLF();
if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator);
s.appendLF();
if (dc_subject.length() > 120) s.append(dc_subject, 0, 120); else s.append(dc_subject);
s.appendLF();
if (dc_publisher.length() > 80) s.append(dc_publisher, 0, 80); else s.append(dc_publisher);
s.appendLF();
if (lon == 0.0f && lat == 0.0f) s.appendLF(); else s.append(Float.toString(lat)).append(',').append(Float.toString(lon)).appendLF();
return UTF8.getBytes(s.toString());
}
@ -375,7 +378,39 @@ public class URIMetadataRow implements URIMetadata {
return this.ranking;
}
public Components metadata() {
public boolean matches(final Pattern matcher) {
return this.metadata().matches(matcher);
}
public DigestURI url() {
return this.metadata().url();
}
public String dc_title() {
return this.metadata().dc_title();
}
public String dc_creator() {
return this.metadata().dc_creator();
}
public String dc_publisher() {
return this.metadata().dc_publisher();
}
public String dc_subject() {
return this.metadata().dc_subject();
}
public float lat() {
return this.metadata().lat();
}
public float lon() {
return this.metadata().lon();
}
private Components metadata() {
// avoid double computation of metadata elements
if (this.comp != null) return this.comp;
// parse elements from comp field;
@ -428,7 +463,7 @@ public class URIMetadataRow implements URIMetadata {
public byte[] language() {
byte[] b = this.entry.getColBytes(col_lang, true);
if (b[0] == (byte)'[') {
if (b == null || b[0] == (byte)'[') {
String tld = this.metadata().url.getTLD();
if (tld.length() < 2 || tld.length() > 2) return ASCII.getBytes("en");
return ASCII.getBytes(tld);
@ -542,7 +577,7 @@ public class URIMetadataRow implements URIMetadata {
//return "{" + core + "}";
}
public class Components {
private class Components {
private DigestURI url;
private String urlRaw;
private byte[] urlHash;

@ -116,6 +116,13 @@ public class Word {
return word2hash(word.toString());
}
private final static byte lowByte = Base64Order.alpha_enhanced[0];
private final static byte highByte = Base64Order.alpha_enhanced[Base64Order.alpha_enhanced.length - 1];
/**
 * Checks whether a hash lies in the private hash range, i.e. whether each of
 * its first five bytes equals highByte.
 *
 * @param hash the hash to inspect
 * @return true if the first five bytes are all highByte
 */
public static boolean isPrivate(byte[] hash) {
    // stop at the first byte that differs, checking positions 0..4 in order
    for (int i = 0; i < 5; i++) {
        if (hash[i] != highByte) return false;
    }
    return true;
}
// create a word hash
public static final byte[] word2hash(final String word) {
final String wordlc = word.toLowerCase(Locale.ENGLISH);
@ -123,6 +130,12 @@ public class Word {
if (h != null) return h;
// calculate the hash
h = Base64Order.enhancedCoder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength);
while (h[0] == highByte && h[1] == highByte && h[2] == highByte && h[3] == highByte && h[4] == highByte) {
// ensure that word hashes do not start with hash '_____' which is a key for an extra hash range for private usage on the local peer
// statistically we are inside this loop only every 2^^30 calls of word2hash (which means almost never)
System.arraycopy(h, 1, h, 0, commonHashLength - 1);
h[commonHashLength - 1] = lowByte;
}
assert h[2] != '@';
if (MemoryControl.shortStatus()) {
hashCache.clear();
@ -133,6 +146,16 @@ public class Word {
return h;
}
public final static byte PRIVATE_TYPE_COPY = 'C'; // used for a private local copy of the index
public final static byte PRIVATE_TYPE_PHONETIC = 'K'; // used for ColognePhonetics
/**
 * Builds a private hash from a given hash: five highByte bytes, then the
 * private type byte, then the leading bytes of the given hash filling the
 * remaining positions up to commonHashLength.
 *
 * @param hash the hash whose leading bytes are copied behind the 6-byte prefix
 * @param privateType the type marker stored at position 5
 * @return a new array of length commonHashLength holding the private hash
 */
public static final byte[] hash2private(final byte[] hash, byte privateType) {
    final byte[] privateHash = new byte[commonHashLength];
    // prefix that marks the private range
    for (int i = 0; i < 5; i++) {
        privateHash[i] = highByte;
    }
    privateHash[5] = privateType;
    // the rest of the array is taken from the start of the source hash
    System.arraycopy(hash, 0, privateHash, 6, commonHashLength - 6);
    return privateHash;
}
public static final HandleSet words2hashesHandles(final Set<String> words) {
final HandleSet hashes = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, words.size());
for (final String word: words)

@ -32,7 +32,7 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.cora.order.ByteOrder;
import net.yacy.kelondro.order.NaturalOrder;

@ -32,9 +32,9 @@ import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.MergeIterator;
/**

@ -33,8 +33,8 @@ import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.util.MemoryControl;

@ -45,9 +45,9 @@ import java.util.concurrent.LinkedBlockingQueue;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.CloneableIterator;
public final class HandleMap implements Iterable<Row.Entry> {

@ -34,9 +34,9 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.Iterator;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.util.SetTools;

@ -30,7 +30,7 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.cora.order.CloneableIterator;
public interface Index extends Iterable<Row.Entry> {

@ -32,8 +32,8 @@ import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.MergeIterator;
import net.yacy.kelondro.order.StackIterator;

@ -32,9 +32,9 @@ import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.MergeIterator;
import net.yacy.kelondro.order.StackIterator;

@ -37,12 +37,12 @@ import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.ranking.AbstractOrder;
import net.yacy.cora.ranking.Order;
import net.yacy.cora.order.AbstractOrder;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.Order;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.kelondroException;

@ -34,12 +34,12 @@ import java.util.concurrent.Callable;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.sorting.Array;
import net.yacy.cora.sorting.Sortable;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;

@ -35,9 +35,9 @@ import java.util.TreeMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.CloneableIterator;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.MemoryControl;

@ -45,9 +45,9 @@ public final class CharBuffer extends Writer {
public CharBuffer() {
buffer = new char[10];
length = 0;
offset = 0;
this.buffer = new char[10];
this.length = 0;
this.offset = 0;
}
public CharBuffer(final int initLength) {
@ -57,44 +57,44 @@ public final class CharBuffer extends Writer {
}
public CharBuffer(final char[] bb) {
buffer = bb;
length = bb.length;
offset = 0;
this.buffer = bb;
this.length = bb.length;
this.offset = 0;
}
public CharBuffer(final char[] bb, final int initLength) {
this.buffer = new char[initLength];
System.arraycopy(bb, 0, buffer, 0, bb.length);
length = bb.length;
offset = 0;
System.arraycopy(bb, 0, this.buffer, 0, bb.length);
this.length = bb.length;
this.offset = 0;
}
public CharBuffer(final char[] bb, final int of, final int le) {
if (of * 2 > bb.length) {
buffer = new char[le];
System.arraycopy(bb, of, buffer, 0, le);
length = le;
offset = 0;
this.buffer = new char[le];
System.arraycopy(bb, of, this.buffer, 0, le);
this.length = le;
this.offset = 0;
} else {
buffer = bb;
length = le;
offset = of;
this.buffer = bb;
this.length = le;
this.offset = of;
}
}
public CharBuffer(final CharBuffer bb) {
buffer = bb.buffer;
length = bb.length;
offset = bb.offset;
this.buffer = bb.buffer;
this.length = bb.length;
this.offset = bb.offset;
}
public CharBuffer(final File f) throws IOException {
// initially fill the buffer with the content of a file
if (f.length() > Integer.MAX_VALUE) throw new IOException("file is too large for buffering");
length = 0;
buffer = new char[(int) f.length()*2];
offset = 0;
this.length = 0;
this.buffer = new char[(int) f.length()*2];
this.offset = 0;
FileReader fr = null;
try {
@ -114,52 +114,66 @@ public final class CharBuffer extends Writer {
public void clear() {
this.buffer = new char[0];
length = 0;
offset = 0;
this.length = 0;
this.offset = 0;
}
public int length() {
return length;
return this.length;
}
private void grow(int minSize) {
int newsize = buffer.length + 1024;
int newsize = this.buffer.length + 1024;
if (newsize < minSize) newsize = minSize+1;
char[] tmp = new char[newsize];
System.arraycopy(buffer, offset, tmp, 0, length);
buffer = tmp;
offset = 0;
System.arraycopy(this.buffer, this.offset, tmp, 0, this.length);
this.buffer = tmp;
this.offset = 0;
}
@Override
public void write(final int b) {
write((char)b);
}
public void write(final char b) {
if (offset + length + 1 > buffer.length) grow(offset + length + 1);
buffer[offset + length++] = b;
if (this.offset + this.length + 1 > this.buffer.length) grow(this.offset + this.length + 1);
this.buffer[this.offset + this.length++] = b;
}
@Override
public void write(final char[] bb) {
write(bb, 0, bb.length);
}
@Override
public void write(final char[] bb, final int of, final int le) {
if (offset + length + le > buffer.length) grow(offset + length + le);
System.arraycopy(bb, of, buffer, offset + length, le);
length += le;
if (this.offset + this.length + le > this.buffer.length) grow(this.offset + this.length + le);
System.arraycopy(bb, of, this.buffer, this.offset + this.length, le);
this.length += le;
}
private static final char SPACE = ' ';
private static final char CR = (char) 13;
private static final char LF = (char) 10;
public CharBuffer appendSpace() {
write(SPACE);
return this;
}
// do not use/implement the following method, a
// "overridden method is a bridge method"
// will occur
// public serverCharBuffer append(char b) {
// write(b);
// return this;
// }
public CharBuffer appendCR() {
write(CR);
return this;
}
public CharBuffer appendLF() {
write(LF);
return this;
}
public CharBuffer append(final int i) {
write((char) (i));
write((char) i);
return this;
}
@ -173,6 +187,7 @@ public final class CharBuffer extends Writer {
return this;
}
@Override
public CharBuffer append(final char c) {
write(c);
return this;
@ -180,39 +195,36 @@ public final class CharBuffer extends Writer {
public CharBuffer append(final String s) {
final char[] temp = new char[s.length()];
s.getChars(0, temp.length, temp, 0);
return append(temp);
s.getChars(0, temp.length, temp, 0);
write(temp);
return this;
}
/**
 * Appends {@code len} characters of the given string, starting at
 * index {@code off}.
 *
 * @param s   the source string; must not be null
 * @param off index of the first character of {@code s} to append
 * @param len number of characters to append
 * @return this buffer, to allow call chaining
 */
public CharBuffer append(final String s, final int off, final int len) {
    final char[] temp = new char[len];
    s.getChars(off, off + len, temp, 0);
    write(temp);
    return this;
}
/**
 * Appends the current content of another buffer, i.e. the
 * {@code bb.length} characters starting at {@code bb.offset}.
 *
 * @param bb the buffer whose content is appended; must not be null
 * @return this buffer, to allow call chaining
 */
public CharBuffer append(final CharBuffer bb) {
    write(bb.buffer, bb.offset, bb.length);
    return this;
}
// public serverCharBuffer append(Object o) {
// if (o instanceof String) return append((String) o);
// if (o instanceof char[]) return append((char[]) o);
// return null;
// }
/**
 * Returns the character at the given position of the content.
 *
 * @param pos zero-based position, relative to the start of the content
 * @return the character stored at that position
 * @throws IndexOutOfBoundsException if {@code pos} is negative or not
 *         smaller than the content length
 */
public char charAt(final int pos) {
    // valid positions are 0 .. length - 1; position 'length' is already outside
    if (pos < 0 || pos >= this.length) {
        throw new IndexOutOfBoundsException("charAt: pos = " + pos + ", length = " + this.length);
    }
    return this.buffer[this.offset + pos];
}
public void deleteCharAt(final int pos) {
if (pos < 0) return;
if (pos >= length) return;
if (pos == length - 1) {
length--;
if (pos >= this.length) return;
if (pos == this.length - 1) {
this.length--;
} else {
System.arraycopy(buffer, offset + pos + 1, buffer, offset + pos, length - pos - 1);
System.arraycopy(this.buffer, this.offset + pos + 1, this.buffer, this.offset + pos, this.length - pos - 1);
}
}
@ -225,20 +237,20 @@ public final class CharBuffer extends Writer {
}
/**
 * Searches for the first occurrence of a character at or after the
 * given position.
 *
 * @param b     the character to search for
 * @param start position where the search begins; negative values are
 *              treated as 0
 * @return the position of the first match relative to the start of the
 *         content, or -1 if there is none
 */
public int indexOf(final char b, final int start) {
    // clamp so that a negative start cannot address characters in front of the content
    for (int i = Math.max(start, 0); i < this.length; i++) {
        if (this.buffer[this.offset + i] == b) {
            return i;
        }
    }
    return -1;
}
public int indexOf(final char[] bs, final int start) {
if (start + bs.length > length) return -1;
loop: for (int i = start; i <= length - bs.length; i++) {
if (start + bs.length > this.length) return -1;
loop: for (int i = start; i <= this.length - bs.length; i++) {
// first test only first char
if (buffer[offset + i] != bs[0]) continue loop;
if (this.buffer[this.offset + i] != bs[0]) continue loop;
// then test all remaining char
for (int j = 1; j < bs.length; j++) {
if (buffer[offset + i + j] != bs[j]) continue loop;
if (this.buffer[this.offset + i + j] != bs[j]) continue loop;
}
// found hit
@ -278,14 +290,14 @@ public final class CharBuffer extends Writer {
}
/**
 * Searches backwards for the last occurrence of a character.
 *
 * @param b the character to search for
 * @return the position of the last match relative to the start of the
 *         content, or -1 if there is none
 */
public int lastIndexOf(final char b) {
    for (int i = this.length - 1; i >= 0; i--) {
        if (this.buffer[this.offset + i] == b) {
            return i;
        }
    }
    return -1;
}
/**
 * Tests whether the content begins with the given characters.
 *
 * @param bs the expected prefix
 * @return true if the first {@code bs.length} characters of the content
 *         equal {@code bs}; false otherwise, also when the content is
 *         shorter than {@code bs}
 */
public boolean startsWith(final char[] bs) {
    if (this.length < bs.length) {
        return false;
    }
    for (int i = 0; i < bs.length; i++) {
        if (this.buffer[this.offset + i] != bs[i]) {
            return false;
        }
    }
    return true;
}
@ -295,45 +307,45 @@ public final class CharBuffer extends Writer {
}
/**
 * Returns a copy of the content from the given position to the end.
 *
 * @param start first position to copy (inclusive)
 * @return a new array holding the copied characters
 */
public char[] getChars(final int start) {
    return getChars(start, this.length);
}
/**
 * Returns a copy of a section of the content.
 *
 * @param start first position to copy (inclusive)
 * @param end   position behind the last character to copy (exclusive)
 * @return a new array of {@code end - start} characters
 * @throws IndexOutOfBoundsException if {@code start} is negative, if
 *         {@code start} or {@code end} exceed the content length, or if
 *         {@code start > end}
 */
public char[] getChars(final int start, final int end) {
    if (end > this.length) throw new IndexOutOfBoundsException("getChars: end > length");
    if (start > this.length) throw new IndexOutOfBoundsException("getChars: start > length");
    if (start < 0) throw new IndexOutOfBoundsException("getChars: start < 0");
    // without this check a negative array size would be requested below
    if (start > end) throw new IndexOutOfBoundsException("getChars: start > end");
    final int count = end - start;
    final char[] tmp = new char[count];
    System.arraycopy(this.buffer, this.offset + start, tmp, 0, count);
    return tmp;
}
/**
 * Converts the content to bytes by building a String from it and
 * passing that String to {@code UTF8.getBytes}.
 *
 * @return the bytes produced by {@code UTF8.getBytes} for the content
 */
public byte[] getBytes() {
    final String content = new String(this.buffer, this.offset, this.length);
    return UTF8.getBytes(content);
}
/**
 * Drops the first {@code start} characters of the content. No data is
 * copied; only the offset and length of the content window change.
 *
 * @param start number of leading characters to drop
 * @return this buffer, to allow call chaining
 * @throws IndexOutOfBoundsException if {@code start} exceeds the content length
 */
public CharBuffer trim(final int start) {
    if (start > this.length) throw new IndexOutOfBoundsException("trim: start > length");
    this.offset += start;
    this.length -= start;
    return this;
}
/**
 * Reduces the content to the section between {@code start} (inclusive)
 * and {@code end} (exclusive). No data is copied; only the offset and
 * length of the content window change.
 *
 * @param start first position to keep
 * @param end   position behind the last character to keep
 * @return this buffer, to allow call chaining
 * @throws IndexOutOfBoundsException if {@code start} or {@code end}
 *         exceed the content length or if {@code start > end}
 */
public CharBuffer trim(final int start, final int end) {
    if (start > this.length) throw new IndexOutOfBoundsException("trim: start > length");
    if (end > this.length) throw new IndexOutOfBoundsException("trim: end > length");
    if (start > end) throw new IndexOutOfBoundsException("trim: start > end");
    this.offset += start;
    this.length = end - start;
    return this;
}
/**
 * Removes leading and trailing characters with a code of 32 (space) or
 * less from the content.
 *
 * @return this buffer, to allow call chaining
 */
public CharBuffer trim() {
    // first position from the left that holds a character above ' '
    int l = 0;
    while ((l < this.length) && (this.buffer[this.offset + l] <= ' ')) l++;
    // position behind the last character above ' '
    int r = this.length;
    while ((r > 0) && (this.buffer[this.offset + r - 1] <= ' ')) r--;
    // content without any character above ' ': both scans pass each other, result is empty
    if (l > r) r = l;
    return trim(l, r);
}
@ -342,12 +354,12 @@ public final class CharBuffer extends Writer {
// returns true, if trim() would result in an empty serverByteBuffer
if (includeNonLetterBytes) {
char b;
for (int i = 0; i < length; i++) {
b = buffer[offset + i];
for (int i = 0; i < this.length; i++) {
b = this.buffer[this.offset + i];
if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) return false;
}
} else {
for (int i = 0; i < length; i++) if (buffer[offset + i] > 32) return false;
for (int i = 0; i < this.length; i++) if (this.buffer[this.offset + i] > 32) return false;
}
return true;
}
@ -356,86 +368,87 @@ public final class CharBuffer extends Writer {
// returns number of whitespace char at the beginning of text
if (includeNonLetterBytes) {
char b;
for (int i = 0; i < length; i++) {
b = buffer[offset + i];
for (int i = 0; i < this.length; i++) {
b = this.buffer[this.offset + i];
if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) return i;
}
} else {
for (int i = 0; i < length; i++) if (buffer[offset + i] > 32) return i;
for (int i = 0; i < this.length; i++) if (this.buffer[this.offset + i] > 32) return i;
}
return length;
return this.length;
}
public int whitespaceEnd(final boolean includeNonLetterBytes) {
// returns position of whitespace at the end of text
if (includeNonLetterBytes) {
char b;
for (int i = length - 1; i >= 0; i--) {
b = buffer[offset + i];
for (int i = this.length - 1; i >= 0; i--) {
b = this.buffer[this.offset + i];
if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) return i + 1;
}
} else {
for (int i = length - 1; i >= 0; i--) if (buffer[offset + i] > 32) return i + 1;
for (int i = this.length - 1; i >= 0; i--) if (this.buffer[this.offset + i] > 32) return i + 1;
}
return 0;
}
/**
 * Returns the content as a new String.
 *
 * @return a String built from the {@code length} characters starting at {@code offset}
 */
@Override
public String toString() {
    return new String(this.buffer, this.offset, this.length);
}
/**
 * Returns a section of the content as a new String.
 *
 * @param left       first position of the section (inclusive)
 * @param rightbound position behind the last character of the section (exclusive)
 * @return a String of {@code rightbound - left} characters
 */
public String toString(final int left, final int rightbound) {
    final int count = rightbound - left;
    return new String(this.buffer, this.offset + left, count);
}
public Properties propParser() {
// extract a=b or a="b" - relations from the buffer
int pos = offset;
int pos = this.offset;
int start;
String key;
final Properties p = new Properties();
// eat up spaces at beginning
while ((pos < length) && (buffer[pos] <= 32)) pos++;
while (pos < length) {
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
while (pos < this.length) {
// pos is at start of next key
start = pos;
while ((pos < length) && (buffer[pos] != equal)) pos++;
if (pos >= length) break; // this is the case if we found no equal
key = new String(buffer, start, pos - start).trim().toLowerCase();
while ((pos < this.length) && (this.buffer[pos] != equal)) pos++;
if (pos >= this.length) break; // this is the case if we found no equal
key = new String(this.buffer, start, pos - start).trim().toLowerCase();
// we have a key
pos++;
// find start of value
while ((pos < length) && (buffer[pos] <= 32)) pos++;
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
// doublequotes are obligatory. However, we want to be fuzzy if they
// are ommittet
if (pos >= length) {
if (pos >= this.length) {
// error case: input ended too early
break;
} else if (buffer[pos] == doublequote) {
} else if (this.buffer[pos] == doublequote) {
// search next doublequote
pos++;
start = pos;
while ((pos < length) && (buffer[pos] != doublequote)) pos++;
if (pos >= length) break; // this is the case if we found no parent doublequote
p.setProperty(key, new String(buffer, start, pos - start).trim());
while ((pos < this.length) && (this.buffer[pos] != doublequote)) pos++;
if (pos >= this.length) break; // this is the case if we found no parent doublequote
p.setProperty(key, new String(this.buffer, start, pos - start).trim());
pos++;
} else if (buffer[pos] == singlequote) {
} else if (this.buffer[pos] == singlequote) {
// search next singlequote
pos++;
start = pos;
while ((pos < length) && (buffer[pos] != singlequote)) pos++;
if (pos >= length) break; // this is the case if we found no parent singlequote
p.setProperty(key, new String(buffer, start, pos - start).trim());
while ((pos < this.length) && (this.buffer[pos] != singlequote)) pos++;
if (pos >= this.length) break; // this is the case if we found no parent singlequote
p.setProperty(key, new String(this.buffer, start, pos - start).trim());
pos++;
} else {
// search next whitespace
start = pos;
while ((pos < length) && (buffer[pos] > 32)) pos++;
p.setProperty(key, new String(buffer, start, pos - start).trim());
while ((pos < this.length) && (this.buffer[pos] > 32)) pos++;
p.setProperty(key, new String(this.buffer, start, pos - start).trim());
}
// pos should point now to a whitespace: eat up spaces
while ((pos < length) && (buffer[pos] <= 32)) pos++;
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
// go on with next loop
}
return p;
@ -475,10 +488,12 @@ public final class CharBuffer extends Writer {
return newbuf;
}
/**
 * Releases the backing array. The buffer field is null afterwards.
 *
 * @throws IOException declared in the signature; this implementation does not throw it
 */
@Override
public void close() throws IOException {
    this.buffer = null; // assist with garbage collection
}
/**
 * Does nothing; the body is intentionally empty.
 */
@Override
public void flush() throws IOException {
// no-op: the write methods of this class store directly into the backing array
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save