- enhanced html parser: recognizes many more details in the content

- added more properties to the solr index
- refactoring
- more constants in switchboard
- fix for some NPEs
- recognition of more images
- removed synchronization in HandleMap (obviously not necessary?)
- added a nolocal configuration to remove excessive DNS lookups (works only in allip mode - default off). Indexes produced with this setting are all flagged as 'local' and are (on purpose) not usable for freeworld, because they will be rejected as being local.
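
To make the last point concrete, here is a minimal sketch (not part of the commit) of the intended behaviour, using only the Domains methods changed further down in this diff; host and address values are placeholders:

import java.net.InetAddress;
import net.yacy.cora.protocol.Domains;

public class NoLocalCheckSketch {
    public static void main(final String[] args) {
        // default: only genuinely local hosts/addresses are classified as local,
        // which for host names may require a DNS lookup inside Domains.isLocal()
        System.out.println(Domains.isLocalhost("example.org"));    // false

        // with the check switched off (intended for network.unit.domain = any together
        // with network.unit.domain.nocheck = true) everything counts as local, so the
        // DNS lookup is skipped and the resulting index entries carry the 'local' flag
        Domains.setNoLocalCheck(true);
        System.out.println(Domains.isLocalhost("example.org"));    // true
        final InetAddress a = Domains.parseInetAddress("192.168.1.1");
        System.out.println(Domains.isLocal(a));                    // true
    }
}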



git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7672 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent bc84d2bc9d
commit b77b8cac0c

@ -98,6 +98,13 @@ filesize.max.other = 8589934591
network.unit.definition = defaults/yacy.network.freeworld.unit
#network.unit.definition = defaults/yacy.network.intranet.unit
# distinguish intranet/internet IPs:
# if this setting is set to true, then only URL hashes with the 'intranet' flag are created, even if the
# url is in the internet. This can enhance the crawling speed dramatically, since the DNS lookup
# that checks whether a host is in the internet or an intranet can be omitted.
# This option is only valid if the network.unit.domain property is set to 'any'
network.unit.domain.nocheck = false
# Update process properties
# The update server location is given in the network.unit.definition,
# but the settings for update processing and cycles are individual.

@ -190,7 +190,7 @@ public class ConfigNetwork_p {
prop.putHTML("network.unit.definition", sb.getConfig("network.unit.definition", ""));
prop.putHTML("network.unit.name", sb.getConfig(SwitchboardConstants.NETWORK_NAME, ""));
prop.putHTML("network.unit.description", sb.getConfig("network.unit.description", ""));
prop.putHTML("network.unit.domain", sb.getConfig("network.unit.domain", ""));
prop.putHTML("network.unit.domain", sb.getConfig(SwitchboardConstants.NETWORK_DOMAIN, ""));
prop.putHTML("network.unit.dht", sb.getConfig("network.unit.dht", ""));
networkBootstrapLocations.remove(sb.getConfig("network.unit.definition", ""));
int c = 0;

@ -34,6 +34,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@ -414,7 +415,7 @@ public class Crawler_p {
writer.close();
// get links and generate filter
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());
final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString());
@ -492,7 +493,7 @@ public class Crawler_p {
// String description = scraper.getDescription();
// get links and generate filter
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
final Map<MultiProtocolURI, Properties> hyperlinks = scraper.getAnchors();
if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet());
// put links onto crawl queue
@ -515,10 +516,10 @@ public class Crawler_p {
cachePolicy);
sb.crawler.putActive(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
final Iterator<Map.Entry<MultiProtocolURI, Properties>> linkiterator = hyperlinks.entrySet().iterator();
DigestURI nexturl;
while (linkiterator.hasNext()) {
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
final Map.Entry<MultiProtocolURI, Properties> e = linkiterator.next();
if (e.getKey() == null) continue;
nexturl = new DigestURI(e.getKey());
// remove the url from the database to be prepared to crawl them again
@ -530,7 +531,7 @@ public class Crawler_p {
sb.peers.mySeed().hash.getBytes(),
nexturl,
null,
e.getValue(),
e.getValue().getProperty("name", ""),
new Date(),
profile.handle(),
0,
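
Illustration (not part of the commit): getAnchors() now returns Map<MultiProtocolURI, Properties> instead of Map<MultiProtocolURI, String>, so callers like Crawler_p and CrawlStacker read the former link text via getProperty("name", ...). A hedged usage sketch with names taken from this changeset:

import java.util.Map;
import java.util.Properties;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.html.ContentScraper;

public class AnchorDump {
    // print every anchor of a scraped page together with its text and rel attribute
    public static void dump(final ContentScraper scraper) {
        for (final Map.Entry<MultiProtocolURI, Properties> e : scraper.getAnchors().entrySet()) {
            final String name = e.getValue().getProperty("name", ""); // the old map value (anchor text)
            final String rel  = e.getValue().getProperty("rel", "");  // tag attribute, if present
            System.out.println(e.getKey().toNormalform(false, false) + " [" + name + "] rel=" + rel);
        }
    }
}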

@ -35,6 +35,7 @@ import java.net.UnknownHostException;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
@ -231,7 +232,7 @@ public final class CrawlStacker {
}
}
}
public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, String> hyperlinks, boolean replace) {
public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, boolean replace) {
new Thread() {
public void run() {
enqueueEntries(initiator, profileHandle, hyperlinks, true);
@ -239,8 +240,8 @@ public final class CrawlStacker {
}.start();
}
public void enqueueEntries(byte[] initiator, String profileHandle, Map<MultiProtocolURI, String> hyperlinks, boolean replace) {
for (Map.Entry<MultiProtocolURI, String> e: hyperlinks.entrySet()) {
public void enqueueEntries(byte[] initiator, String profileHandle, Map<MultiProtocolURI, Properties> hyperlinks, boolean replace) {
for (Map.Entry<MultiProtocolURI, Properties> e: hyperlinks.entrySet()) {
if (e.getKey() == null) continue;
// delete the old entry, if it exists, to force a re-load of the url (that's wanted here)
@ -272,7 +273,7 @@ public final class CrawlStacker {
initiator,
url,
null,
e.getValue(),
e.getValue().getProperty("name", ""),
new Date(),
profileHandle,
0,

@ -305,7 +305,7 @@ public class RobotsTxt {
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress.");
// sending the get request
robotsTxt = client.GETbytes(robotsURL.toString());
robotsTxt = client.GETbytes(robotsURL);
// statistics:
if (robotsTxt != null) {
ByteCount.addAccountCount(ByteCount.CRAWLER, robotsTxt.length);

@ -53,7 +53,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
private static final int EcoFSBufferSize = 2000;
private static final int maxStackSize = 1000;
public final static Row rowdef = new Row(
private final static Row rowdef = new Row(
"String urlhash-" + Word.commonHashLength + ", " + // the url's hash
"String executor-" + Word.commonHashLength + ", " + // the crawling executor
"Cardinal workdate-8 {b256}, " + // the time when the url was last time tried to load
@ -64,8 +64,8 @@ public class ZURL implements Iterable<ZURL.Entry> {
);
// the class object
protected Index urlIndex;
protected final ConcurrentLinkedQueue<byte[]> stack;
private Index urlIndex;
private final ConcurrentLinkedQueue<byte[]> stack;
public ZURL(
final File cachePath,

@ -125,7 +125,7 @@ public final class HTTPLoader {
client.setTimout(socketTimeout);
client.setHeader(requestHeader.entrySet());
// send request
final byte[] responseBody = client.GETbytes(url.toString(), maxFileSize);
final byte[] responseBody = client.GETbytes(url, maxFileSize);
final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
@ -241,7 +241,7 @@ public final class HTTPLoader {
final HTTPClient client = new HTTPClient();
client.setTimout(20000);
client.setHeader(requestHeader.entrySet());
final byte[] responseBody = client.GETbytes(request.url().toString(), Long.MAX_VALUE);
final byte[] responseBody = client.GETbytes(request.url(), Long.MAX_VALUE);
final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders());
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
// FIXME: 30*-handling (bottom) is never reached

@ -36,6 +36,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import java.util.Map.Entry;
@ -130,7 +131,7 @@ public class BookmarkHelper {
int importCount = 0;
Map<MultiProtocolURI, String> links = new HashMap<MultiProtocolURI, String>();
Map<MultiProtocolURI, Properties> links = new HashMap<MultiProtocolURI, Properties>();
String title;
MultiProtocolURI url;
Bookmark bm;
@ -144,9 +145,9 @@ public class BookmarkHelper {
writer.close();
links = scraper.getAnchors();
} catch (final IOException e) { Log.logWarning("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());}
for (final Entry<MultiProtocolURI, String> link: links.entrySet()) {
for (final Entry<MultiProtocolURI, Properties> link: links.entrySet()) {
url = link.getKey();
title = link.getValue();
title = link.getValue().getProperty("name", "");
Log.logInfo("BOOKMARKS", "links.get(url)");
if ("".equals(title)) {//cannot be displayed
title = url.toString();

@ -140,6 +140,15 @@ public final class Cache {
if (responseHeader == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: responseHeader == null");
if (file == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: file == null");
log.logInfo("storing content of url " + url.toString() + ", " + file.length + " bytes");
// store the file
try {
fileDB.insert(url.hash(), file);
} catch (UnsupportedEncodingException e) {
throw new IOException("Cache.store: cannot write to fileDB (1): " + e.getMessage());
} catch (IOException e) {
throw new IOException("Cache.store: cannot write to fileDB (2): " + e.getMessage());
}
// store the response header into the header database
final HashMap<String, String> hm = new HashMap<String, String>();
@ -154,15 +163,6 @@ public final class Cache {
} catch (Exception e) {
throw new IOException("Cache.store: cannot write to headerDB: " + e.getMessage());
}
// store the file
try {
fileDB.insert(url.hash(), file);
} catch (UnsupportedEncodingException e) {
throw new IOException("Cache.store: cannot write to fileDB (1): " + e.getMessage());
} catch (IOException e) {
throw new IOException("Cache.store: cannot write to fileDB (2): " + e.getMessage());
}
if (log.isFine()) log.logFine("stored in cache: " + url.toNormalform(true, false));
}
@ -173,8 +173,11 @@ public final class Cache {
*/
public static boolean has(final DigestURI url) {
boolean headerExists;
headerExists = responseHeaderDB.containsKey(url.hash());
boolean fileExists = fileDB.containsKey(url.hash());
boolean fileExists;
//synchronized (responseHeaderDB) {
headerExists = responseHeaderDB.containsKey(url.hash());
fileExists = fileDB.containsKey(url.hash());
//}
if (headerExists && fileExists) return true;
if (!headerExists && !fileExists) return false;
// if not both is there then we do a clean-up

@ -342,8 +342,8 @@ public class Segment {
Response.docType(document.dc_format()), // doctype
condenser.RESULT_FLAGS, // flags
UTF8.getBytes(language), // language
document.inboundLinks(), // inbound links
document.outboundLinks(), // outbound links
document.inboundLinkCount(), // inbound links
document.outboundLinkCount(), // outbound links
document.getAudiolinks().size(), // laudio
document.getImages().size(), // limage
document.getVideolinks().size(), // lvideo
@ -363,8 +363,8 @@ public class Segment {
condenser, // document condenser
language, // document language
Response.docType(document.dc_format()), // document type
document.inboundLinks(), // inbound links
document.outboundLinks(), // outbound links
document.inboundLinkCount(), // inbound links
document.outboundLinkCount(), // outbound links
searchEvent, // a search event that can have results directly
sourceName // the name of the source where the index was created
);

@ -574,6 +574,9 @@ public final class Switchboard extends serverSwitch {
isGlobalMode(),
this.domainList); // Intranet and Global mode may be both true!
// possibly switch off localIP check
Domains.setNoLocalCheck(this.isAllIPMode());
// check status of account configuration: when local url crawling is allowed, it is not allowed
// that an automatic authorization of localhost is done, because in this case crawls from local
// addresses are blocked to prevent attack scenarios where remote pages contain links to localhost
@ -828,7 +831,7 @@ public final class Switchboard extends serverSwitch {
setConfig(plasmaSwitchboardConstants.INDEX_RECEIVE_ALLOW, true);
}
*/
MultiProtocolURI.addBotInfo(getConfig(SwitchboardConstants.NETWORK_NAME, "") + (isRobinsonMode() ? "-" : "/") + getConfig("network.unit.domain", "global"));
MultiProtocolURI.addBotInfo(getConfig(SwitchboardConstants.NETWORK_NAME, "") + (isRobinsonMode() ? "-" : "/") + getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global"));
}
@ -941,11 +944,13 @@ public final class Switchboard extends serverSwitch {
this.crawler,
this.indexSegments.segment(Segments.Process.LOCALCRAWLING),
this.peers,
"local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
"global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
"local.any".indexOf(getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0,
"global.any".indexOf(getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0,
this.domainList);
}
Domains.setNoLocalCheck(this.isAllIPMode()); // possibly switch off localIP check
// start up crawl jobs
continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
@ -1022,13 +1027,25 @@ public final class Switchboard extends serverSwitch {
}
public boolean isIntranetMode() {
return "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0;
return "local.any".indexOf(getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0;
}
public boolean isGlobalMode() {
return "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0;
return "global.any".indexOf(getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0;
}
public boolean isAllIPMode() {
return "any".indexOf(getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global")) >= 0;
}
/**
* in nocheck mode the isLocal property is not checked, so the DNS lookup can be omitted; this can only be done in allip mode
* @return true if the network unit runs in allip mode and network.unit.domain.nocheck is set
*/
public boolean isIPNoCheckMode() {
return isAllIPMode() && getConfigBool(SwitchboardConstants.NETWORK_DOMAIN_NOCHECK, false);
}
public boolean isRobinsonMode() {
// we are in robinson mode, if we do not exchange index by dht distribution
// we need to take care that search requests and remote indexing requests go only
@ -1893,9 +1910,13 @@ public final class Switchboard extends serverSwitch {
for (Document doc: in.documents) {
try {
String id = UTF8.String(new DigestURI(doc.dc_identifier(), null).hash());
assert id.equals(UTF8.String(in.queueEntry.url().hash()));
String iquh = UTF8.String(in.queueEntry.url().hash());
if (!id.equals(iquh)) {
log.logWarning("doc=" + id + ":" + doc.dc_identifier() + ", query=" + iquh + ":" + in.queueEntry.url());
// if this happens, the doc id appears to be the correct one
}
try {
this.solrConnector.add(id, doc);
this.solrConnector.add(id, in.queueEntry.getResponseHeader(), doc);
} catch (IOException e) {
Log.logWarning("SOLR", "failed to send " + in.queueEntry.url().toNormalform(true, false) + " to solr: " + e.getMessage());
}
@ -1951,9 +1972,7 @@ public final class Switchboard extends serverSwitch {
assert in.queueEntry != null;
assert in.documents != null;
assert in.queueEntry != null;
final Integer[] ioLinks = webStructure.generateCitationReference(in.queueEntry.url(), in.documents[i], (in.condenser == null) ? null : in.condenser[i], in.queueEntry.lastModified()); // [outlinksSame, outlinksOther]
in.documents[i].setInboundLinks(ioLinks[0].intValue());
in.documents[i].setOutboundLinks(ioLinks[1].intValue());
webStructure.generateCitationReference(in.queueEntry.url(), in.documents[i], (in.condenser == null) ? null : in.condenser[i], in.queueEntry.lastModified()); // [outlinksSame, outlinksOther]
}
return in;
}
@ -2621,7 +2640,7 @@ public final class Switchboard extends serverSwitch {
yacyCore.log.logInfo("BOOTSTRAP: seed-list URL " + seedListFileURL + " too old (" + (header.age() / 86400000) + " days)");
} else {
ssc++;
final byte[] content = client.GETbytes(url.toString());
final byte[] content = client.GETbytes(url);
enu = FileUtils.strings(content);
lc = 0;
while (enu.hasNext()) {
@ -2746,7 +2765,7 @@ public final class Switchboard extends serverSwitch {
client.setHeader(reqHeader.entrySet());
try {
// sending request
final Map<String, String> result = FileUtils.table(client.GETbytes(url.toString()));
final Map<String, String> result = FileUtils.table(client.GETbytes(url));
return (result == null) ? new HashMap<String, String>() : result;
} catch (final Exception e) {
Log.logException(e);

@ -395,6 +395,8 @@ public final class SwitchboardConstants {
*
*/
public static final String NETWORK_NAME = "network.unit.name";
public static final String NETWORK_DOMAIN = "network.unit.domain";
public static final String NETWORK_DOMAIN_NOCHECK = "network.unit.domain.nocheck";
public static final String NETWORK_WHITELIST = "network.unit.access.whitelist";
public static final String NETWORK_BLACKLIST = "network.unit.access.blacklist";

@ -128,39 +128,31 @@ public class WebStructureGraph {
}
}
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(final DigestURI url, final Document document, final Condenser condenser, final Date docDate) {
public void generateCitationReference(final DigestURI url, final Document document, final Condenser condenser, final Date docDate) {
// generate citation reference
final Map<MultiProtocolURI, String> hl = document.getHyperlinks();
final Iterator<MultiProtocolURI> it = hl.keySet().iterator();
final HashSet<MultiProtocolURI> globalRefURLs = new HashSet<MultiProtocolURI>();
final String refhost = url.getHost();
MultiProtocolURI u;
int GCount = 0;
int LCount = 0;
while (it.hasNext()) {
u = it.next();
if (u == null) continue;
if (refhost != null && u.getHost() != null && u.getHost().equals(refhost)) {
// this is a local link
LCount++;
} else {
if (refhost != null && u.getHost() != null && !u.getHost().equals(refhost)) {
// this is a global link
GCount++;
globalRefURLs.add(u);
}
}
leanrefObject lro = new leanrefObject(url, globalRefURLs);
if (globalRefURLs.size() > 0) try {
if (this.publicRefDNSResolvingWorker.isAlive()) {
this.publicRefDNSResolvingQueue.put(new leanrefObject(url, globalRefURLs));
this.publicRefDNSResolvingQueue.put(lro);
} else {
this.learnrefs(new leanrefObject(url, globalRefURLs));
this.learnrefs(lro);
}
} catch (InterruptedException e) {
this.learnrefs(new leanrefObject(url, globalRefURLs));
this.learnrefs(lro);
}
return new Integer[] {Integer.valueOf(LCount), Integer.valueOf(GCount)};
}
public void learnrefs(final leanrefObject lro) {

@ -39,6 +39,7 @@ import java.security.SignatureException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
@ -239,7 +240,7 @@ public final class yacyRelease extends yacyVersion {
}
// analyze links in scraper resource, and find link to latest release in it
final Map<MultiProtocolURI, String> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
final Map<MultiProtocolURI, Properties> anchors = scraper.getAnchors(); // a url (String) / name (String) relation
final TreeSet<yacyRelease> mainReleases = new TreeSet<yacyRelease>();
final TreeSet<yacyRelease> devReleases = new TreeSet<yacyRelease>();
for (MultiProtocolURI url : anchors.keySet()) {

@ -71,6 +71,7 @@ import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.util.MapTools;
import net.yacy.kelondro.util.OS;
import de.anomic.search.Switchboard;
import de.anomic.tools.bitfield;
import de.anomic.tools.crypt;
import de.anomic.yacy.dht.FlatWordPartitionScheme;
@ -844,8 +845,9 @@ public class yacySeed implements Cloneable, Comparable<yacySeed>, Comparator<yac
if (ipString.length() > 0 && ipString.length() < 8) return ipString + " -> IP is too short: ";
InetAddress ip = Domains.dnsResolve(ipString);
if (ip == null) return ipString + " -> IP is not proper"; //this does not work with staticIP
if (ipString.equals("localhost") || ipString.startsWith("127.") || ipString.startsWith("0:0:0:0:0:0:0:1")) return ipString + " - IP for localhost rejected";
return null;
if (Switchboard.getSwitchboard().isAllIPMode()) return null;
boolean islocal = Domains.isLocal(ip);
return (!islocal && Switchboard.getSwitchboard().isGlobalMode() || (islocal && Switchboard.getSwitchboard().isIntranetMode())) ? null : ipString + " - IP for localhost rejected";
}
@Override

@ -892,7 +892,7 @@ public final class yacySeedDB implements AlternativeDomainNames {
byte[] content = null;
try {
// send request
content = client.GETbytes(seedURL.toString());
content = client.GETbytes(seedURL);
} catch (final Exception e) {
throw new IOException("Unable to download seed file '" + seedURL + "'. " + e.getMessage());
}

@ -1142,7 +1142,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
client.setTimout(timeout);
client.setUserAgent(userAgent);
client.setHost(this.getHost());
return new ByteArrayInputStream(client.GETbytes(this.toNormalform(false, false)));
return new ByteArrayInputStream(client.GETbytes(this));
}
return null;
@ -1163,7 +1163,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
client.setTimout(timeout);
client.setUserAgent(userAgent);
client.setHost(this.getHost());
return client.GETbytes(this.toNormalform(false, false));
return client.GETbytes(this);
}
return null;

@ -426,7 +426,8 @@ public class Domains {
// the id=7 is used to flag local addresses
}
private static KeyList globalHosts;
private static KeyList globalHosts = null;
private static boolean noLocalCheck = false;
public static void init(File globalHostsnameCache) {
if (globalHostsnameCache == null) {
@ -437,6 +438,10 @@ public class Domains {
globalHosts = null;
}
}
public static void setNoLocalCheck(boolean v) {
noLocalCheck = v;
}
public static void close() {
if (globalHosts != null) try {globalHosts.close();} catch (IOException e) {}
@ -532,10 +537,12 @@ public class Domains {
ip = NAME_CACHE_HIT.get(host);
if (ip != null) {
//System.out.println("DNSLOOKUP-CACHE-HIT(SYNC) " + host);
LOOKUP_SYNC.remove(host);
return ip;
}
if (NAME_CACHE_MISS.containsKey(host)) {
//System.out.println("DNSLOOKUP-CACHE-MISS(SYNC) " + host);
LOOKUP_SYNC.remove(host);
return null;
}
@ -563,14 +570,13 @@ public class Domains {
}
}
LOOKUP_SYNC.remove(host);
return ip;
}
}
private final static Pattern dotPattern = Pattern.compile("\\.");
private static final InetAddress parseInetAddress(String ip) {
public static final InetAddress parseInetAddress(String ip) {
if (ip == null || ip.length() < 8) return null;
if (ip.equals("0:0:0:0:0:0:0:1%0")) ip = "127.0.0.1";
final String[] ips = dotPattern.split(ip);
@ -776,7 +782,8 @@ public class Domains {
}
public static boolean isLocalhost(final String host) {
return ("127.0.0.1".equals(host) ||
return (noLocalCheck ||
"127.0.0.1".equals(host) ||
"localhost".equals(host) ||
host.startsWith("0:0:0:0:0:0:0:1")
);
@ -787,7 +794,8 @@ public class Domains {
}
private static boolean isLocal(final String host, boolean recursive) {
if (host == null || host.length() == 0) return true;
if (noLocalCheck || host == null || host.length() == 0) return true;
// FIXME IPv4 only
// check local ip addresses
@ -802,11 +810,13 @@ public class Domains {
// check dns lookup: may be a local address even if the domain name looks global
if (!recursive) return false;
final InetAddress a = dnsResolve(host);
boolean localp = a == null || a.isAnyLocalAddress() || a.isLinkLocalAddress() || a.isLoopbackAddress() || a.isSiteLocalAddress() || isLocal(a.getHostAddress(), false);
return localp;
return isLocal(a);
}
public static boolean isLocal(InetAddress a) {
boolean localp = noLocalCheck || a == null || a.isAnyLocalAddress() || a.isLinkLocalAddress() || a.isLoopbackAddress() || a.isSiteLocalAddress() || isLocal(a.getHostAddress(), false);
return localp;
}
public static void main(final String[] args) {
/*

@ -69,7 +69,7 @@ public class ResponseHeader extends HeaderFramework {
public Date lastModified() {
Date d = headerDate(LAST_MODIFIED);
if (d == null) return new Date(); else return d;
if (d == null) return date(); else return d;
}
public long age() {

@ -267,7 +267,10 @@ public class HTTPClient {
* @throws IOException
*/
public byte[] GETbytes(final String uri) throws IOException {
return GETbytes(uri, Long.MAX_VALUE);
return GETbytes(uri, Long.MAX_VALUE);
}
public byte[] GETbytes(final MultiProtocolURI url) throws IOException {
return GETbytes(url, Long.MAX_VALUE);
}
/**
@ -279,12 +282,15 @@ public class HTTPClient {
* @throws IOException
*/
public byte[] GETbytes(final String uri, long maxBytes) throws IOException {
final MultiProtocolURI url = new MultiProtocolURI(uri);
return GETbytes(new MultiProtocolURI(uri), maxBytes);
}
public byte[] GETbytes(final MultiProtocolURI url, long maxBytes) throws IOException {
boolean localhost = url.getHost().equals("localhost");
String urix = url.toNormalform(true, false, !localhost, false);
final HttpGet httpGet = new HttpGet(urix);
if (!localhost) setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
return getContentBytes(httpGet, maxBytes);
final HttpGet httpGet = new HttpGet(urix);
if (!localhost) setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service
return getContentBytes(httpGet, maxBytes);
}
/**

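Illustration (not part of the commit): GETbytes can now be fed a MultiProtocolURI directly instead of a url string. A hedged fetch sketch; the HTTPClient package path is assumed, the other imports appear elsewhere in this diff:

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.http.HTTPClient;

public class FetchSketch {
    public static byte[] fetch(final String u) throws Exception {
        final MultiProtocolURI url = new MultiProtocolURI(u);
        final HTTPClient client = new HTTPClient();
        client.setTimout(10000);                  // note: the method is spelled 'setTimout' in this code base
        final byte[] body = client.GETbytes(url); // was: client.GETbytes(url.toString())
        client.finish();
        return body;
    }
}
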
@ -85,7 +85,7 @@ public class HTTPConnector {
client.setHost(vhost);
byte[] b;
try {
b = client.POSTbytes(url.toNormalform(true, false, true, false), post, usegzip);
b = client.POSTbytes(url, url.getHost(), post, usegzip);
} finally {
client.finish();
}

@ -25,10 +25,20 @@
package net.yacy.cora.services.federated.solr;
import java.net.InetAddress;
import java.util.Collection;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.cora.document.MultiProtocolURI;
import org.apache.solr.common.SolrInputDocument;
public enum SolrScheme {
@ -37,21 +47,21 @@ public enum SolrScheme {
DublinCore;
public SolrInputDocument yacy2solr(String id, Document document) {
if (this == SolrCell) return yacy2solrSolrCell(id, document);
public SolrInputDocument yacy2solr(String id, ResponseHeader header, Document document) {
if (this == SolrCell) return yacy2solrSolrCell(id, header, document);
return null;
}
public static SolrInputDocument yacy2solrSolrCell(String id, Document yacydoc) {
public static SolrInputDocument yacy2solrSolrCell(String id, ResponseHeader header, Document yacydoc) {
// we use the SolrCell design as the index scheme
SolrInputDocument solrdoc = new SolrInputDocument();
DigestURI digestURI = new DigestURI(yacydoc.dc_source());
solrdoc.addField("id", id);
solrdoc.addField("sku", digestURI.toNormalform(true, false), 3.0f);
InetAddress address = Domains.dnsResolve(digestURI.getHost());
if (address != null) solrdoc.addField("attr_ip", address.getHostAddress());
if (digestURI.getHost() != null) solrdoc.addField("attr_host", digestURI.getHost());
/*
*
private final MultiProtocolURI source; // the source url
private final String mimeType; // mimeType as taken from http header
private final String charset; // the charset of the document
private final List<String> keywords; // most resources provide a keyword field
private StringBuilder title; // a document title, taken from title or h1 tag; shall appear as headline of search result
@ -73,14 +83,149 @@ public enum SolrScheme {
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
private Set<String> languages;
private boolean indexingDenied;
private float lon, lat;
*/
solrdoc.addField("title", yacydoc.dc_title());
solrdoc.addField("author", yacydoc.dc_creator());
solrdoc.addField("description", yacydoc.dc_description());
solrdoc.addField("content_type", yacydoc.dc_format());
solrdoc.addField("subject", yacydoc.dc_subject(' '));
solrdoc.addField("text", UTF8.String(yacydoc.getTextBytes()));
solrdoc.addField("last_modified", header.lastModified());
solrdoc.addField("keywords", yacydoc.dc_subject(' '));
String content = UTF8.String(yacydoc.getTextBytes());
solrdoc.addField("attr_text", content);
int contentwc = content.split(" ").length;
solrdoc.addField("wordcount_i", contentwc);
// path elements of link
String path = digestURI.getPath();
if (path != null) {
String[] paths = path.split("/");
if (paths.length > 0) solrdoc.addField("attr_paths", paths);
}
// list all links
Map<MultiProtocolURI, Properties> alllinks = yacydoc.getAnchors();
int c = 0;
String[] inboundlinks = new String[yacydoc.inboundLinkCount()];
solrdoc.addField("inboundlinkscount_i", inboundlinks.length);
for (MultiProtocolURI url: yacydoc.inboundLinks()) {
Properties p = alllinks.get(url);
String name = p.getProperty("name", "");
String rel = p.getProperty("rel", "");
inboundlinks[c++] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
}
solrdoc.addField("attr_inboundlinks", inboundlinks);
c = 0;
String[] outboundlinks = new String[yacydoc.outboundLinkCount()];
solrdoc.addField("outboundlinkscount_i", outboundlinks.length);
for (MultiProtocolURI url: yacydoc.outboundLinks()) {
Properties p = alllinks.get(url);
String name = p.getProperty("name", "");
String rel = p.getProperty("rel", "");
outboundlinks[c++] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
((rel.toLowerCase().equals("nofollow")) ? " rel=\"nofollow\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
}
solrdoc.addField("attr_outboundlinks", yacydoc.outboundLinks().toArray());
// charset
solrdoc.addField("attr_charset", yacydoc.getCharset());
// coordinates
if (yacydoc.lat() != 0.0f && yacydoc.lon() != 0.0f) {
solrdoc.addField("lon_coordinate", yacydoc.lon());
solrdoc.addField("lat_coordinate", yacydoc.lat());
}
solrdoc.addField("attr_httpstatus", "200");
Object parser = yacydoc.getParserObject();
if (parser instanceof ContentScraper) {
ContentScraper html = (ContentScraper) parser;
// header tags
int h = 0;
int f = 1;
for (int i = 1; i <= 6; i++) {
String[] hs = html.getHeadlines(i);
h = h | (hs.length > 0 ? f : 0);
f = f * 2;
solrdoc.addField("attr_h" + i, hs);
}
solrdoc.addField("htags_i", h);
// meta tags
Map<String, String> metas = html.getMetas();
String robots = metas.get("robots");
if (robots != null) solrdoc.addField("attr_meta_robots", robots);
String generator = metas.get("generator");
if (generator != null) solrdoc.addField("attr_meta_generator", generator);
// bold, italic
String[] bold = html.getBold();
if (bold.length > 0) solrdoc.addField("attr_bold", bold);
String[] italic = html.getItalic();
if (bold.length > 0) solrdoc.addField("attr_italic", italic);
String[] li = html.getLi();
solrdoc.addField("licount_i", li.length);
if (li.length > 0) solrdoc.addField("attr_li", li);
// images
Collection<ImageEntry> imagesc = html.getImages().values();
String[] images = new String[imagesc.size()];
c = 0;
for (ImageEntry ie: imagesc) images[c++] = ie.toString();
solrdoc.addField("imagescount_i", images.length);
if (images.length > 0) solrdoc.addField("attr_images", images);
// style sheets
Map<MultiProtocolURI, String> csss = html.getCSS();
String[] css = new String[csss.size()];
c = 0;
for (Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
css[c++] =
"<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
" href=\""+ entry.getKey().toNormalform(false, false, false, false) + "\" />";
}
solrdoc.addField("csscount_i", css.length);
if (css.length > 0) solrdoc.addField("attr_css", css);
// Scripts
Set<MultiProtocolURI> scriptss = html.getScript();
String[] scripts = new String[scriptss.size()];
c = 0;
for (MultiProtocolURI url: scriptss) {
scripts[c++] = url.toNormalform(false, false, false, false);
}
solrdoc.addField("scriptscount_i", scripts.length);
if (scripts.length > 0) solrdoc.addField("attr_scripts", scripts);
// Frames
Set<MultiProtocolURI> framess = html.getFrames();
String[] frames = new String[framess.size()];
c = 0;
for (MultiProtocolURI entry: framess) {
frames[c++] = entry.toNormalform(false, false, false, false);
}
solrdoc.addField("framesscount_i", frames.length);
if (frames.length > 0) solrdoc.addField("attr_frames", frames);
// IFrames
Set<MultiProtocolURI> iframess = html.getIFrames();
String[] iframes = new String[iframess.size()];
c = 0;
for (MultiProtocolURI entry: iframess) {
iframes[c++] = entry.toNormalform(false, false, false, false);
}
solrdoc.addField("iframesscount_i", iframes.length);
if (iframes.length > 0) solrdoc.addField("attr_iframes", iframes);
// flash embedded
solrdoc.addField("flash_b", html.containsFlash());
}
return solrdoc;
}
@ -88,11 +233,7 @@ public enum SolrScheme {
/*
* standard solr scheme
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
<field name="name" type="textgen" indexed="true" stored="true"/>
<field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
<field name="manu" type="textgen" indexed="true" stored="true" omitNorms="true"/>
<field name="cat" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
@ -100,7 +241,6 @@ public enum SolrScheme {
<field name="weight" type="float" indexed="true" stored="true"/>
<field name="price" type="float" indexed="true" stored="true"/>
<field name="popularity" type="int" indexed="true" stored="true" />
<field name="inStock" type="boolean" indexed="true" stored="true" />
<!-- Common metadata fields, named specifically to match up with
SolrCell metadata when parsing rich documents such as Word, PDF.
@ -118,13 +258,5 @@ public enum SolrScheme {
<field name="last_modified" type="date" indexed="true" stored="true"/>
<field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
<!-- catchall field, containing all other searchable text fields (implemented
via copyField further on in this schema -->
<field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
<field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>
*/
}
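
Illustration (not part of the commit): yacy2solr now also receives the ResponseHeader, so header data such as the Last-Modified date can be written into the Solr document. A hedged conversion sketch:

import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.services.federated.solr.SolrScheme;
import net.yacy.document.Document;
import org.apache.solr.common.SolrInputDocument;

public class SolrConversionSketch {
    // convert a parsed yacy Document into a Solr document using the SolrCell scheme
    public static SolrInputDocument convert(final String id, final ResponseHeader header, final Document doc) {
        return SolrScheme.SolrCell.yacy2solr(id, header, doc); // header.lastModified() ends up in 'last_modified'
    }
}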

@ -41,6 +41,7 @@ import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.logging.Log;
@ -187,12 +188,12 @@ public class SolrSingleConnector {
}
*/
public void add(String id, Document doc) throws IOException {
add(id, doc, this.scheme);
public void add(String id, ResponseHeader header, Document doc) throws IOException {
add(id, header, doc, this.scheme);
}
public void add(String id, Document doc, SolrScheme tempScheme) throws IOException {
SolrInputDocument solrdoc = tempScheme.yacy2solr(id, doc);
public void add(String id, ResponseHeader header, Document doc, SolrScheme tempScheme) throws IOException {
SolrInputDocument solrdoc = tempScheme.yacy2solr(id, header, doc);
int thisrrc = this.transmissionRoundRobinCounter;
int nextrrc = thisrrc++;
if (nextrrc >= transmissionQueueCount) nextrrc = 0;
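
Illustration (not part of the commit): the connector's add() accordingly takes the ResponseHeader as well. A hedged caller sketch; the SolrSingleConnector package is assumed to match SolrScheme:

import java.io.IOException;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.services.federated.solr.SolrSingleConnector;
import net.yacy.document.Document;

public class SolrFeedSketch {
    public static void feed(final SolrSingleConnector connector, final String id,
                            final ResponseHeader header, final Document doc) {
        try {
            connector.add(id, header, doc);   // was: connector.add(id, doc)
        } catch (final IOException e) {
            System.err.println("failed to send " + id + " to solr: " + e.getMessage());
        }
    }
}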

@ -47,6 +47,7 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
@ -72,33 +73,36 @@ public class Document {
private final List<String> sections; // if present: more titles/headlines appearing in the document
private final StringBuilder description; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private final Map<MultiProtocolURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
private final Map<MultiProtocolURI, Properties> anchors; // all links embedded as clickable entities (anchor tags)
private final Map<MultiProtocolURI, String> rss; // all embedded rss feeds
private final Map<MultiProtocolURI, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
private Map<MultiProtocolURI, String> hyperlinks, audiolinks, videolinks, applinks;
private Map<MultiProtocolURI, String> hyperlinks, audiolinks, videolinks, applinks, inboundlinks, outboundlinks;
private Map<String, String> emaillinks;
private MultiProtocolURI favicon;
private boolean resorted;
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
private Set<String> languages;
private boolean indexingDenied;
private float lon, lat;
private Object parserObject; // the source object that was used to create the Document
public Document(final MultiProtocolURI location, final String mimeType, final String charset, final Set<String> languages,
public Document(final MultiProtocolURI location, final String mimeType, final String charset,
final Object parserObject,
final Set<String> languages,
final String[] keywords, final String title, final String author, final String publisher,
final String[] sections, final String abstrct,
final float lon, final float lat,
final Object text,
final Map<MultiProtocolURI, String> anchors,
final Map<MultiProtocolURI, Properties> anchors,
final Map<MultiProtocolURI, String> rss,
final Map<MultiProtocolURI, ImageEntry> images,
boolean indexingDenied) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
this.parserObject = parserObject;
this.keywords = (keywords == null) ? new LinkedList<String>() : Arrays.asList(keywords);
this.title = (title == null) ? new StringBuilder(0) : new StringBuilder(title);
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
@ -106,7 +110,7 @@ public class Document {
this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
this.lon = lon;
this.lat = lat;
this.anchors = (anchors == null) ? new HashMap<MultiProtocolURI, String>(0) : anchors;
this.anchors = (anchors == null) ? new HashMap<MultiProtocolURI, Properties>(0) : anchors;
this.rss = (rss == null) ? new HashMap<MultiProtocolURI, String>(0) : rss;
this.images = (images == null) ? new HashMap<MultiProtocolURI, ImageEntry>() : images;
this.publisher = publisher;
@ -116,19 +120,15 @@ public class Document {
this.applinks = null;
this.emaillinks = null;
this.resorted = false;
this.inboundLinks = -1;
this.outboundLinks = -1;
this.inboundlinks = null;
this.outboundlinks = null;
this.languages = languages;
this.indexingDenied = indexingDenied;
this.text = text == null ? new ByteArrayOutputStream() : text;
}
public void setInboundLinks(int il) {
this.inboundLinks = il;
}
public void setOutboundLinks(int ol) {
this.outboundLinks = ol;
public Object getParserObject() {
return this.parserObject;
}
/**
@ -179,8 +179,8 @@ dc_rights
public String dc_creator() {
return (creator == null) ? "" : creator.toString();
}
public String dc_subject(final char separator) {
public String[] dc_subject() {
// sort out doubles and empty words
final TreeSet<String> hs = new TreeSet<String>();
String s;
@ -189,11 +189,18 @@ dc_rights
s = (this.keywords.get(i)).trim();
if (s.length() > 0) hs.add(s.toLowerCase());
}
if (hs.isEmpty()) return "";
String[] t = new String[hs.size()];
int i = 0;
for (String u: hs) t[i++] = u;
return t;
}
public String dc_subject(final char separator) {
String[] t = dc_subject();
if (t.length == 0) return "";
// generate a new list
final StringBuilder sb = new StringBuilder(this.keywords.size() * 6);
final Iterator<String> i = hs.iterator();
while (i.hasNext()) sb.append(i.next()).append(separator);
final StringBuilder sb = new StringBuilder(t.length * 8);
for (String s: t) sb.append(s).append(separator);
return sb.substring(0, sb.length() - 1);
}
@ -314,7 +321,7 @@ dc_rights
return this.keywords;
}
public Map<MultiProtocolURI, String> getAnchors() {
public Map<MultiProtocolURI, Properties> getAnchors() {
// returns all links embedded as anchors (clickable entities)
// this is a url / Properties map; the Properties hold the tag attributes plus the anchor text under "name"
return anchors;
@ -371,72 +378,79 @@ dc_rights
return this.lat;
}
private synchronized void resortLinks() {
private void resortLinks() {
if (this.resorted) return;
// extract hyperlinks, medialinks and emaillinks from anchorlinks
MultiProtocolURI url;
String u;
int extpos, qpos;
String ext = null;
final Iterator<Map.Entry<MultiProtocolURI, String>> i = anchors.entrySet().iterator();
hyperlinks = new HashMap<MultiProtocolURI, String>();
videolinks = new HashMap<MultiProtocolURI, String>();
audiolinks = new HashMap<MultiProtocolURI, String>();
applinks = new HashMap<MultiProtocolURI, String>();
emaillinks = new HashMap<String, String>();
final Map<MultiProtocolURI, ImageEntry> collectedImages = new HashMap<MultiProtocolURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
Map.Entry<MultiProtocolURI, String> entry;
while (i.hasNext()) {
entry = i.next();
url = entry.getKey();
if (url == null) continue;
u = url.toNormalform(true, false);
if (u.startsWith("mailto:")) {
emaillinks.put(u.substring(7), entry.getValue());
} else {
extpos = u.lastIndexOf('.');
if (extpos > 0) {
if (((qpos = u.indexOf('?')) >= 0) && (qpos > extpos)) {
ext = u.substring(extpos + 1, qpos).toLowerCase();
} else {
ext = u.substring(extpos + 1).toLowerCase();
}
if (Classification.isMediaExtension(ext)) {
// this is not a normal anchor, its a media link
if (Classification.isImageExtension(ext)) {
ContentScraper.addImage(collectedImages, new ImageEntry(url, entry.getValue(), -1, -1, -1));
synchronized (this) {
if (this.resorted) return;
// extract hyperlinks, medialinks and emaillinks from anchorlinks
MultiProtocolURI url;
String u;
int extpos, qpos;
String ext = null;
String thishost = this.source.getHost();
this.inboundlinks = new HashMap<MultiProtocolURI, String>();
this.outboundlinks = new HashMap<MultiProtocolURI, String>();
this.hyperlinks = new HashMap<MultiProtocolURI, String>();
this.videolinks = new HashMap<MultiProtocolURI, String>();
this.audiolinks = new HashMap<MultiProtocolURI, String>();
this.applinks = new HashMap<MultiProtocolURI, String>();
this.emaillinks = new HashMap<String, String>();
final Map<MultiProtocolURI, ImageEntry> collectedImages = new HashMap<MultiProtocolURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
for (Map.Entry<MultiProtocolURI, ImageEntry> entry: collectedImages.entrySet()) {
if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image");
}
for (Map.Entry<MultiProtocolURI, Properties> entry: anchors.entrySet()) {
url = entry.getKey();
if (url == null) continue;
if (url.getHost() != null && thishost != null && url.getHost().equals(thishost)) this.inboundlinks.put(url, "anchor"); else this.outboundlinks.put(url, "anchor");
u = url.toNormalform(true, false);
String name = entry.getValue().getProperty("name", "");
if (u.startsWith("mailto:")) {
emaillinks.put(u.substring(7), name);
} else {
extpos = u.lastIndexOf('.');
if (extpos > 0) {
if (((qpos = u.indexOf('?')) >= 0) && (qpos > extpos)) {
ext = u.substring(extpos + 1, qpos).toLowerCase();
} else {
ext = u.substring(extpos + 1).toLowerCase();
}
if (Classification.isMediaExtension(ext)) {
// this is not a normal anchor, its a media link
if (Classification.isImageExtension(ext)) {
ContentScraper.addImage(collectedImages, new ImageEntry(url, name, -1, -1, -1));
}
else if (Classification.isAudioExtension(ext)) audiolinks.put(url, name);
else if (Classification.isVideoExtension(ext)) videolinks.put(url, name);
else if (Classification.isApplicationExtension(ext)) applinks.put(url, name);
}
else if (Classification.isAudioExtension(ext)) audiolinks.put(url, entry.getValue());
else if (Classification.isVideoExtension(ext)) videolinks.put(url, entry.getValue());
else if (Classification.isApplicationExtension(ext)) applinks.put(url, entry.getValue());
}
// in any case we consider this as a link and let the parser decide if that link can be followed
hyperlinks.put(url, name);
}
// in any case we consider this as a link and let the parser decide if that link can be followed
hyperlinks.put(url, entry.getValue());
}
// add image links that we collected from the anchors to the image map
ContentScraper.addAllImages(images, collectedImages);
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
hyperlinks.putAll(allReflinks(images.values()));
hyperlinks.putAll(allReflinks(audiolinks.keySet()));
hyperlinks.putAll(allReflinks(videolinks.keySet()));
hyperlinks.putAll(allReflinks(applinks.keySet()));
/*
hyperlinks.putAll(allSubpaths(hyperlinks.keySet()));
hyperlinks.putAll(allSubpaths(images.values()));
hyperlinks.putAll(allSubpaths(audiolinks.keySet()));
hyperlinks.putAll(allSubpaths(videolinks.keySet()));
hyperlinks.putAll(allSubpaths(applinks.keySet()));
*/
// don't do this again
this.resorted = true;
}
// add image links that we collected from the anchors to the image map
ContentScraper.addAllImages(images, collectedImages);
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
hyperlinks.putAll(allReflinks(images.values()));
hyperlinks.putAll(allReflinks(audiolinks.keySet()));
hyperlinks.putAll(allReflinks(videolinks.keySet()));
hyperlinks.putAll(allReflinks(applinks.keySet()));
/*
hyperlinks.putAll(allSubpaths(hyperlinks.keySet()));
hyperlinks.putAll(allSubpaths(images.values()));
hyperlinks.putAll(allSubpaths(audiolinks.keySet()));
hyperlinks.putAll(allSubpaths(videolinks.keySet()));
hyperlinks.putAll(allSubpaths(applinks.keySet()));
*/
// don't do this again
this.resorted = true;
}
public static Map<MultiProtocolURI, String> allSubpaths(final Collection<?> links) {
@ -573,12 +587,24 @@ dc_rights
this.favicon = faviconURL;
}
public int inboundLinks() {
return (this.inboundLinks < 0) ? 0 : this.inboundLinks;
public int inboundLinkCount() {
if (this.inboundlinks == null) resortLinks();
return (this.inboundlinks == null) ? 0 : this.inboundlinks.size();
}
public int outboundLinkCount() {
if (this.outboundlinks == null) resortLinks();
return (this.outboundlinks == null) ? 0 : this.outboundlinks.size();
}
public int outboundLinks() {
return (this.outboundLinks < 0) ? 0 : this.outboundLinks;
public Set<MultiProtocolURI> inboundLinks() {
if (this.inboundlinks == null) resortLinks();
return (this.inboundlinks == null) ? null : this.inboundlinks.keySet();
}
public Set<MultiProtocolURI> outboundLinks() {
if (this.outboundlinks == null) resortLinks();
return (this.outboundlinks == null) ? null : this.outboundlinks.keySet();
}
public boolean indexingDenied() {
@ -608,7 +634,7 @@ dc_rights
String language = this.dc_language();
if (language != null && language.length() > 0) os.write("<dc:language>" + this.dc_language() + "</dc:language>\n");
os.write("<dc:date>" + ISO8601Formatter.FORMATTER.format(date) + "</dc:date>\n");
if (this.lon != 0.0f && this.lat != 0.0f) os.write("<geo:long>" + this.lon +"</geo:long><geo:lat>" + this.lat + "</geo:lat>\n");
if (this.lon != 0.0f && this.lat != 0.0f) os.write("<geo:Point><geo:long>" + this.lon +"</geo:long><geo:lat>" + this.lat + "</geo:lat></geo:Point>\n");
os.write("</record>\n");
}
@ -665,7 +691,7 @@ dc_rights
final StringBuilder description = new StringBuilder(80);
final LinkedList<String> sectionTitles = new LinkedList<String>();
final Map<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, Properties> anchors = new HashMap<MultiProtocolURI, Properties>();
final Map<MultiProtocolURI, String> rss = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
float lon = 0.0f, lat = 0.0f;
@ -716,6 +742,7 @@ dc_rights
globalMime,
null,
null,
null,
subjects.toString().split(" |,"),
title.toString(),
authors.toString(),
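
Illustration (not part of the commit): Document now derives the inbound/outbound link split itself (lazily, via resortLinks) instead of having the counters injected from outside. A hedged sketch of the new accessors:

import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document;

public class LinkStatsSketch {
    public static void print(final Document doc) {
        // both counters trigger resortLinks() on first use
        System.out.println("inbound links (same host):   " + doc.inboundLinkCount());
        System.out.println("outbound links (other hosts): " + doc.outboundLinkCount());
        final Set<MultiProtocolURI> out = doc.outboundLinks();  // now the urls themselves, not just a count
        if (out != null) for (final MultiProtocolURI u : out) System.out.println("  -> " + u.toNormalform(false, false));
    }
}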

@ -267,6 +267,7 @@ public class DCEntry extends TreeMap<String, String> {
getIdentifier(true),
"text/html",
"UTF-8",
this,
languages,
getSubject(),
getTitle(),

@ -64,6 +64,7 @@ public class csvParser extends AbstractParser implements Parser {
location,
mimeType,
charset,
this,
null,
null,
concatRow(table.get(0)),

@ -88,6 +88,7 @@ public class docParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
this,
null,
null,
title,

@ -50,6 +50,7 @@ public class genericParser extends AbstractParser implements Parser {
location,
mimeType,
charset,
this,
null,
null,
location.getFileName().length() == 0 ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName()), // title
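
Illustration (not part of the commit): every parser now hands itself (or its scraper) to the Document constructor, so downstream code can reach parser-specific details through getParserObject(), which is exactly what the SolrScheme change above relies on. A hedged sketch:

import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;

public class ParserObjectSketch {
    public static void inspect(final Document doc) {
        final Object parser = doc.getParserObject();
        if (parser instanceof ContentScraper) {
            final ContentScraper html = (ContentScraper) parser;
            System.out.println("h1 headlines:   " + html.getHeadlines(1).length);
            System.out.println("contains flash: " + html.containsFlash());
        }
    }
}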

@ -70,6 +70,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
linkTags0.add("meta");
linkTags0.add("area");
linkTags0.add("link");
linkTags0.add("script");
linkTags0.add("embed"); //added by [MN]
linkTags0.add("param"); //added by [MN]
@ -78,17 +79,27 @@ public class ContentScraper extends AbstractScraper implements Scraper {
linkTags1.add("h2");
linkTags1.add("h3");
linkTags1.add("h4");
linkTags1.add("h5");
linkTags1.add("h6");
linkTags1.add("title");
linkTags1.add("b");
linkTags1.add("strong");
linkTags1.add("i");
linkTags1.add("li");
linkTags1.add("iframe");
//<iframe src="../../../index.htm" name="SELFHTML_in_a_box" width="90%" height="400">
}
// class variables: collectors for links
private Map<MultiProtocolURI, String> rss;
private Map<MultiProtocolURI, String> anchors;
private Map<MultiProtocolURI, Properties> anchors;
private Map<MultiProtocolURI, String> rss, css;
private Set<MultiProtocolURI> script, frames, iframes;
private Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final Map<String, String> metas;
private String title;
//private String headline;
private List<String>[] headlines;
private List<String> bold, italic, li;
private CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
private float lon, lat;
@ -110,12 +121,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
super(linkTags0, linkTags1);
this.root = root;
this.rss = new HashMap<MultiProtocolURI, String>();
this.anchors = new HashMap<MultiProtocolURI, String>();
this.css = new HashMap<MultiProtocolURI, String>();
this.anchors = new HashMap<MultiProtocolURI, Properties>();
this.images = new HashMap<MultiProtocolURI, ImageEntry>();
this.frames = new HashSet<MultiProtocolURI>();
this.iframes = new HashSet<MultiProtocolURI>();
this.metas = new HashMap<String, String>();
this.script = new HashSet<MultiProtocolURI>();
this.title = "";
this.headlines = new ArrayList[4];
for (int i = 0; i < 4; i++) headlines[i] = new ArrayList<String>();
this.headlines = new ArrayList[6];
for (int i = 0; i < this.headlines.length; i++) headlines[i] = new ArrayList<String>();
this.bold = new ArrayList<String>();
this.italic = new ArrayList<String>();
this.li = new ArrayList<String>();
this.content = new CharBuffer(1024);
this.htmlFilterEventListeners = new EventListenerList();
this.lon = 0.0f;
@ -202,7 +220,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
s = p + 1;
try {
url = new MultiProtocolURI(u);
anchors.put(url, u);
anchors.put(url, new Properties());
continue;
} catch (MalformedURLException e) {}
}
@ -228,26 +246,24 @@ public class ContentScraper extends AbstractScraper implements Scraper {
try {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
if (width > 15 && height > 15) {
final float ratio = (float) Math.min(width, height) / Math.max(width, height);
if (ratio > 0.4) {
final MultiProtocolURI url = absolutePath(tagopts.getProperty("src", ""));
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
addImage(images, ie);
}
// i think that real pictures have witdth & height tags - thq
// } else if (width < 0 && height < 0) { // add or to ignore !?
// final yacyURL url = absolutePath(tagopts.getProperty("src", ""));
// final htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt", ""), width, height);
// addImage(images, ie);
}
//if (width > 15 && height > 15) {
final MultiProtocolURI url = absolutePath(tagopts.getProperty("src", ""));
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
addImage(images, ie);
//}
} catch (final NumberFormatException e) {}
} else if(tagname.equalsIgnoreCase("base")) {
try {
root = new MultiProtocolURI(tagopts.getProperty("href", ""));
} catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
frames.add(absolutePath(tagopts.getProperty("src", "")));
} else if (tagname.equalsIgnoreCase("iframe")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
iframes.add(absolutePath(tagopts.getProperty("src", "")));
} else if (tagname.equalsIgnoreCase("script")) {
script.add(absolutePath(tagopts.getProperty("src", "")));
} else if (tagname.equalsIgnoreCase("meta")) {
String name = tagopts.getProperty("name", "");
if (name.length() > 0) {
@ -262,7 +278,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String areatitle = cleanLine(tagopts.getProperty("title",""));
//String alt = tagopts.getProperty("alt","");
final String href = tagopts.getProperty("href", "");
if (href.length() > 0) anchors.put(absolutePath(href), areatitle);
Properties p = new Properties(); p.put("name", areatitle);
if (href.length() > 0) anchors.put(absolutePath(href), p);
} else if (tagname.equalsIgnoreCase("link")) {
final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", ""));
@ -277,16 +294,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.favicon = newLink;
} else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
rss.put(newLink, linktitle);
} else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
css.put(newLink, rel);
} else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
anchors.put(newLink, linktitle);
Properties p = new Properties(); p.put("name", linktitle);
anchors.put(newLink, p);
}
}
} else if(tagname.equalsIgnoreCase("embed")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
} else if(tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", "");
if (name.equalsIgnoreCase("movie")) {
anchors.put(absolutePath(tagopts.getProperty("value", "")),name);
anchors.put(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */);
}
}
@ -308,7 +328,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
addImage(images, ie);
} else {
anchors.put(url, recursiveParse(text));
tagopts.put("name", recursiveParse(text));
anchors.put(url, tagopts);
}
}
}
@ -325,8 +346,26 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[3].add(h);
} else if ((tagname.equalsIgnoreCase("h5")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[4].add(h);
} else if ((tagname.equalsIgnoreCase("h6")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[5].add(h);
} else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
title = recursiveParse(text);
} else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) bold.add(h);
} else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) bold.add(h);
} else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) italic.add(h);
} else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) li.add(h);
}
// fire event
@ -389,8 +428,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
// otherwise take any headline
for (int i = 0; i < 4; i++) {
if (!headlines[i].isEmpty()) return headlines[i].get(0);
for (int i = 0; i < this.headlines.length; i++) {
if (!this.headlines[i].isEmpty()) return this.headlines[i].get(0);
}
// take description tag
@ -402,8 +441,31 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
public String[] getHeadlines(final int i) {
assert ((i >= 1) && (i <= 4));
return headlines[i - 1].toArray(new String[headlines.length]);
assert ((i >= 1) && (i <= this.headlines.length));
return this.headlines[i - 1].toArray(new String[this.headlines[i - 1].size()]);
}
public String[] getBold() {
return this.bold.toArray(new String[this.bold.size()]);
}
public String[] getItalic() {
return this.italic.toArray(new String[this.italic.size()]);
}
public String[] getLi() {
return this.li.toArray(new String[this.li.size()]);
}
public boolean containsFlash() {
String ext;
for (MultiProtocolURI url: this.anchors.keySet()) {
ext = url.getFileExtension();
if (ext == null) continue;
if (ext.equals("swf")) return true;
}
return false;
}
public byte[] getText() {
@ -415,7 +477,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
public Map<MultiProtocolURI, String> getAnchors() {
public Map<MultiProtocolURI, Properties> getAnchors() {
        // returns a map from each anchor url to its tag properties (including the "name" entry)
return anchors;
}
@ -425,6 +487,25 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return rss;
}
public Map<MultiProtocolURI, String> getCSS() {
        // returns a map from each stylesheet url to its rel attribute
return css;
}
public Set<MultiProtocolURI> getFrames() {
        // returns the set of frame source urls
return frames;
}
public Set<MultiProtocolURI> getIFrames() {
        // returns the set of iframe source urls
return iframes;
}
public Set<MultiProtocolURI> getScript() {
return script;
}
/**
* get all images
* @return a map of <urlhash, ImageEntry>
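Since getAnchors() now returns a Map&lt;MultiProtocolURI, Properties&gt; instead of a plain url-to-name map, callers have to read the link text out of the per-link Properties object. A minimal consumer sketch, assuming only the "name" key used in the hunks above (the helper class itself is illustrative, not part of this commit):

import java.util.Map;
import java.util.Properties;

import net.yacy.cora.document.MultiProtocolURI;

// Minimal sketch of a caller that consumes the new anchors structure:
// every URL now carries a Properties object; the anchor text is stored
// under the "name" key (see the anchors.put(...) calls above).
public final class AnchorDump {
    public static void print(final Map<MultiProtocolURI, Properties> anchors) {
        for (final Map.Entry<MultiProtocolURI, Properties> entry : anchors.entrySet()) {
            // fall back to an empty string when no name was recorded
            final String name = entry.getValue().getProperty("name", "");
            System.out.println(entry.getKey().toString() + " -> " + name);
        }
    }
}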

@ -65,7 +65,11 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
@Override
public String toString() {
return "{" + url.toString() + ", " + alt + ", " + width + "/" + height + "}";
return "<img url=\"" + url.toNormalform(false, false, false, false) + "\"" +
(alt != null && alt.length() > 0 ? " alt=\"" + alt + "\"" : "") +
(width >= 0 ? " width=\"" + width + "\"" : "") +
(height >= 0 ? " height=\"" + height + "\"" : "") +
">";
}
@Override
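To see what the reworked toString() emits, a small throwaway sketch can be used; the URL, alt text and dimensions below are made-up example values, and the ImageEntry import path is assumed from the surrounding file layout:

import java.net.MalformedURLException;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.html.ImageEntry;

// Prints the ImageEntry in the new img-tag-like form, roughly
// <img url="http://example.org/logo.png" alt="logo" width="80" height="40">
// (negative width/height and an empty alt are simply omitted).
public final class ImageEntryDemo {
    public static void main(final String[] args) throws MalformedURLException {
        final MultiProtocolURI url = new MultiProtocolURI("http://example.org/logo.png");
        final ImageEntry ie = new ImageEntry(url, "logo", 80, 40, -1);
        System.out.println(ie);
    }
}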

@ -162,9 +162,15 @@ public class htmlParser extends AbstractParser implements Parser {
}
private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
final String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
final String[] sections = new String[
scraper.getHeadlines(1).length +
scraper.getHeadlines(2).length +
scraper.getHeadlines(3).length +
scraper.getHeadlines(4).length +
scraper.getHeadlines(5).length +
scraper.getHeadlines(6).length];
int p = 0;
for (int i = 1; i <= 4; i++) {
for (int i = 1; i <= 6; i++) {
for (final String headline : scraper.getHeadlines(i)) {
sections[p++] = headline;
}
@ -173,6 +179,7 @@ public class htmlParser extends AbstractParser implements Parser {
location,
mimeType,
charSet,
scraper,
scraper.getContentLanguages(),
scraper.getKeywords(),
scraper.getTitle(),
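The widened loop now walks all six headline levels. The same collection logic, written as a standalone sketch (the ContentScraper import path is assumed); getHeadlines(i) is 1-based, matching the assert in the accessor above:

import java.util.ArrayList;
import java.util.List;

import net.yacy.document.parser.html.ContentScraper;

// Collects the text of all h1-h6 headlines into one flat sections array,
// mirroring the transformScraper() change above.
public final class SectionCollector {
    public static String[] collect(final ContentScraper scraper) {
        final List<String> sections = new ArrayList<String>();
        for (int level = 1; level <= 6; level++) {
            for (final String headline : scraper.getHeadlines(level)) {
                sections.add(headline);
            }
        }
        return sections.toArray(new String[sections.size()]);
    }
}

The extra scraper argument handed to the Document constructor in this hunk recurs in all of the parser hunks below, which otherwise only thread their own this reference through the same position.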

@ -36,6 +36,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Properties;
import java.util.Iterator;
import java.util.Set;
@ -180,7 +181,7 @@ public class genericImageParser extends AbstractParser implements Parser {
}
final HashSet<String> languages = new HashSet<String>();
final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
final HashMap<MultiProtocolURI, Properties> anchors = new HashMap<MultiProtocolURI, Properties>();
final HashMap<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
// add this image to the map of images
String infoString = ii.info.toString();
@ -192,6 +193,7 @@ public class genericImageParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
this,
languages,
keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords
title, // title

@ -88,6 +88,7 @@ public class mmParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
this,
null,
null,
rootElementText,

@ -162,6 +162,7 @@ public class odtParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
this,
languages,
docKeywords,
docLongTitle,

@ -147,6 +147,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
this,
languages,
docKeywords,
docLongTitle,

@ -165,6 +165,7 @@ public class pdfParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
this,
null,
docKeywords,
docTitle,

@ -86,6 +86,7 @@ public class pptParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
this,
null,
null,
title,

@ -102,6 +102,7 @@ public class psParser extends AbstractParser implements Parser {
location, // url
mimeType, // mime
"UTF-8", // charset
this,
null, // languages
null, // keywords
null, // title

@ -33,6 +33,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
@ -72,18 +73,21 @@ public class rssParser extends AbstractParser implements Parser {
final List<Document> docs = new ArrayList<Document>();
MultiProtocolURI uri;
Set<String> languages;
Map<MultiProtocolURI, String> anchors;
Map<MultiProtocolURI, Properties> anchors;
Document doc;
for (final Hit item: feed) try {
uri = new MultiProtocolURI(item.getLink());
languages = new HashSet<String>();
languages.add(item.getLanguage());
anchors = new HashMap<MultiProtocolURI, String>();
anchors.put(uri, item.getTitle());
anchors = new HashMap<MultiProtocolURI, Properties>();
Properties p = new Properties();
p.put("name", item.getTitle());
anchors.put(uri, p);
doc = new Document(
uri,
TextParser.mimeOf(url),
charset,
this,
languages,
item.getSubject(),
item.getTitle(),
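Several parsers now repeat the same three lines to wrap a single link name into a Properties object. A hypothetical helper (not part of this commit) could keep those call sites to one line, e.g. anchors.put(uri, AnchorProperties.named(item.getTitle())):

import java.util.Properties;

// Hypothetical convenience factory, not part of this commit: wraps a
// link name into the Properties shape the anchors map now expects.
public final class AnchorProperties {
    private AnchorProperties() {}

    public static Properties named(final String name) {
        final Properties p = new Properties();
        p.put("name", name == null ? "" : name);
        return p;
    }
}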

@ -67,6 +67,7 @@ public class rtfParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
this,
null,
null,
((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).

@ -60,6 +60,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
location,
mimeType,
charset,
this,
null,
null,
null,

@ -82,6 +82,7 @@ public class sidAudioParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
this,
null,
null,
header.get("name"),

@ -87,6 +87,7 @@ public class sitemapParser extends AbstractParser implements Parser {
uri,
TextParser.mimeOf(url),
charset,
this,
null,
null,
"",

@ -31,6 +31,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
@ -80,7 +81,7 @@ public class swfParser extends AbstractParser implements Parser {
final String[] sections = null;
final String abstrct = null;
//TreeSet images = null;
final Map<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, Properties> anchors = new HashMap<MultiProtocolURI, Properties>();
int urls = 0;
int urlStart = -1;
int urlEnd = 0;
@ -97,7 +98,9 @@ public class swfParser extends AbstractParser implements Parser {
urlEnd = contents.indexOf(linebreak,urlStart);
url = contents.substring(urlStart,urlEnd);
urlnr = Integer.toString(++urls);
anchors.put(new MultiProtocolURI(url), urlnr);
Properties p = new Properties();
p.put("name", urlnr);
anchors.put(new MultiProtocolURI(url), p);
contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
}
@ -106,6 +109,7 @@ public class swfParser extends AbstractParser implements Parser {
location, // url of the source document
mimeType, // the documents mime type
"UTF-8", // charset of the document text
this,
null,
null, //keywords
((contents.length() > 80)? contents.substring(0, 80):contents.trim()).

@ -98,6 +98,7 @@ public class torrentParser extends AbstractParser implements Parser {
location,
mimeType,
charset,
this,
null,
null,
title, // title

@ -34,6 +34,7 @@ import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Properties;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
@ -66,7 +67,7 @@ public class vcfParser extends AbstractParser implements Parser {
final StringBuilder parsedTitle = new StringBuilder();
final StringBuilder parsedDataText = new StringBuilder();
final HashMap<String, String> parsedData = new HashMap<String, String>();
final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
final HashMap<MultiProtocolURI, Properties> anchors = new HashMap<MultiProtocolURI, Properties>();
final LinkedList<String> parsedNames = new LinkedList<String>();
boolean useLastLine = false;
@ -174,7 +175,9 @@ public class vcfParser extends AbstractParser implements Parser {
} else if (key.toUpperCase().startsWith("URL")) {
try {
final MultiProtocolURI newURL = new MultiProtocolURI(value);
anchors.put(newURL, newURL.toString());
Properties p = new Properties();
p.put("name", newURL.toString());
anchors.put(newURL, p);
//parsedData.put(key,value);
} catch (final MalformedURLException ex) {/* ignore this */}
} else if (
@ -205,6 +208,7 @@ public class vcfParser extends AbstractParser implements Parser {
url, // url of the source document
mimeType, // the documents mime type
null, // charset
this,
null, // set of languages
null, // a list of extracted keywords
parsedTitle.toString(), // a long document title

@ -106,6 +106,7 @@ public class vsdParser extends AbstractParser implements Parser {
location, // url of the source document
mimeType, // the documents mime type
"UTF-8", // charset of the document text
this,
null, // language
keywords,
title,

@ -116,6 +116,7 @@ public class xlsParser extends AbstractParser implements Parser {
location,
mimeType,
"UTF-8",
this,
null,
null,
location.getFile(),

@ -216,7 +216,7 @@ public class ArrayStack implements BLOB {
public long mem() {
long m = 0;
for (blobItem b: this.blobs) m += b.blob.mem();
if (this.blobs != null) for (blobItem b: this.blobs) m += b.blob.mem();
return m;
}
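The added guard only changes behaviour when this.blobs is null, presumably after the stack has been closed or before it has been mounted. A standalone sketch of the same defensive pattern (the List type here is illustrative, not the actual blobItem structure):

import java.util.List;

// Defensive summation over a collection that may legitimately be null;
// mirrors the null guard added to ArrayStack.mem() above.
public final class SafeSum {
    public static long sum(final List<Long> sizes) {
        long m = 0;
        if (sizes != null) for (final Long s : sizes) m += s.longValue();
        return m;
    }
}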

@ -184,20 +184,20 @@ public final class HandleMap implements Iterable<Row.Entry> {
index.clear();
}
public final synchronized byte[] smallestKey() {
public final byte[] smallestKey() {
return index.smallestKey();
}
public final synchronized byte[] largestKey() {
public final byte[] largestKey() {
return index.largestKey();
}
public final synchronized boolean has(final byte[] key) {
public final boolean has(final byte[] key) {
assert (key != null);
return index.has(key);
}
public final synchronized long get(final byte[] key) {
public final long get(final byte[] key) {
assert (key != null);
final Row.Entry indexentry = index.get(key);
if (indexentry == null) return -1;
@ -212,10 +212,10 @@ public final class HandleMap implements Iterable<Row.Entry> {
* @throws IOException
* @throws RowSpaceExceededException
*/
public final synchronized long put(final byte[] key, final long l) throws RowSpaceExceededException {
public final long put(final byte[] key, final long l) throws RowSpaceExceededException {
assert l >= 0 : "l = " + l;
assert (key != null);
final Row.Entry newentry = index.row().newEntry();
final Row.Entry newentry = this.rowdef.newEntry();
newentry.setCol(0, key);
newentry.setCol(1, l);
final Row.Entry oldentry = index.replace(newentry);
@ -223,7 +223,7 @@ public final class HandleMap implements Iterable<Row.Entry> {
return oldentry.getColLong(1);
}
public final synchronized void putUnique(final byte[] key, final long l) throws RowSpaceExceededException {
public final void putUnique(final byte[] key, final long l) throws RowSpaceExceededException {
assert l >= 0 : "l = " + l;
assert (key != null);
final Row.Entry newentry = this.rowdef.newEntry();
@ -232,39 +232,41 @@ public final class HandleMap implements Iterable<Row.Entry> {
index.addUnique(newentry);
}
public final synchronized long add(final byte[] key, final long a) throws RowSpaceExceededException {
public final long add(final byte[] key, final long a) throws RowSpaceExceededException {
assert key != null;
assert a > 0; // it does not make sense to add 0. If this occurs, it is a performance issue
final Row.Entry indexentry = index.get(key);
if (indexentry == null) {
final Row.Entry newentry = this.rowdef.newEntry();
newentry.setCol(0, key);
newentry.setCol(1, a);
index.addUnique(newentry);
return 1;
synchronized (index) {
final Row.Entry indexentry = index.get(key);
if (indexentry == null) {
final Row.Entry newentry = this.rowdef.newEntry();
newentry.setCol(0, key);
newentry.setCol(1, a);
index.addUnique(newentry);
return 1;
}
final long i = indexentry.getColLong(1) + a;
indexentry.setCol(1, i);
index.put(indexentry);
return i;
}
final long i = indexentry.getColLong(1) + a;
indexentry.setCol(1, i);
index.put(indexentry);
return i;
}
public final synchronized long inc(final byte[] key) throws RowSpaceExceededException {
public final long inc(final byte[] key) throws RowSpaceExceededException {
return add(key, 1);
}
public final synchronized long dec(final byte[] key) throws RowSpaceExceededException {
public final long dec(final byte[] key) throws RowSpaceExceededException {
return add(key, -1);
}
public final synchronized ArrayList<long[]> removeDoubles() throws RowSpaceExceededException {
public final ArrayList<long[]> removeDoubles() throws RowSpaceExceededException {
final ArrayList<long[]> report = new ArrayList<long[]>();
long[] is;
int c;
long l;
final int initialSize = this.size();
for (final RowCollection rowset: index.removeDoubles()) {
ArrayList<RowCollection> rd = index.removeDoubles();
for (final RowCollection rowset: rd) {
is = new long[rowset.size()];
c = 0;
for (Row.Entry e: rowset) {
@ -277,7 +279,7 @@ public final class HandleMap implements Iterable<Row.Entry> {
return report;
}
public final synchronized ArrayList<byte[]> top(int count) {
public final ArrayList<byte[]> top(int count) {
List<Row.Entry> list0 = index.top(count);
ArrayList<byte[]> list = new ArrayList<byte[]>();
for (Row.Entry entry: list0) {
@ -288,41 +290,44 @@ public final class HandleMap implements Iterable<Row.Entry> {
public final synchronized long remove(final byte[] key) {
assert (key != null);
final boolean exist = index.has(key);
if (!exist) return -1;
final int s = index.size();
final long m = index.mem();
final Row.Entry indexentry = index.remove(key);
assert (indexentry != null);
assert index.size() < s : "s = " + s + ", index.size() = " + index.size();
assert index.mem() <= m : "m = " + m + ", index.mem() = " + index.mem();
final Row.Entry indexentry;
synchronized (index) {
final boolean exist = index.has(key);
if (!exist) return -1;
final int s = index.size();
final long m = index.mem();
indexentry = index.remove(key);
assert (indexentry != null);
assert index.size() < s : "s = " + s + ", index.size() = " + index.size();
assert index.mem() <= m : "m = " + m + ", index.mem() = " + index.mem();
}
if (indexentry == null) return -1;
return indexentry.getColLong(1);
}
public final synchronized long removeone() {
public final long removeone() {
final Row.Entry indexentry = index.removeOne();
if (indexentry == null) return -1;
return indexentry.getColLong(1);
}
public final synchronized int size() {
public final int size() {
return index.size();
}
public final synchronized boolean isEmpty() {
public final boolean isEmpty() {
return index.isEmpty();
}
public final synchronized CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
public final CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
return index.keys(up, firstKey);
}
public final synchronized CloneableIterator<Row.Entry> rows(final boolean up, final byte[] firstKey) {
public final CloneableIterator<Row.Entry> rows(final boolean up, final byte[] firstKey) {
return index.rows(up, firstKey);
}
public final synchronized void close() {
public final void close() {
index.close();
index = null;
}
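The method-level synchronized modifiers are dropped throughout, but add() and remove() keep an explicit synchronized (index) block: they are compound check-then-act operations that must not interleave, while the single-call delegations are left to the backing index. A minimal illustration of that split on a hypothetical counter map (not YaCy code; ConcurrentHashMap stands in for a backing structure that tolerates unsynchronized single reads):

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

// Hypothetical counter map illustrating the locking split kept in HandleMap:
// compound read-modify-write stays inside one synchronized block, while a
// plain lookup delegates to the (already thread-safe) backing structure.
public final class CounterMap {
    private final ConcurrentMap<String, Long> index = new ConcurrentHashMap<String, Long>();

    public long add(final String key, final long a) {
        synchronized (this.index) {                      // check-then-act must be atomic
            final Long old = this.index.get(key);
            final long value = (old == null) ? a : old.longValue() + a;
            this.index.put(key, value);
            return value;
        }
    }

    public long get(final String key) {                  // single delegated read, no extra lock
        final Long value = this.index.get(key);
        return value == null ? -1 : value.longValue();
    }
}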
