From 96c8119b500f2ffc90ebbb1da4001ea605c149dc Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 6 Jun 2012 12:57:42 +0200 Subject: [PATCH 1/7] added GeoLocation / GeoPoint classes which uses less memory than Location/Coordinates and has initializers with correct order of lat,lon coordinates --- htroot/yacysearch.java | 6 +- htroot/yacysearch_location.java | 6 +- .../document/geolocalization/Coordinates.java | 73 ------- .../{Location.java => GeoLocation.java} | 183 +++++++++--------- .../document/geolocalization/GeoPoint.java | 91 +++++++++ .../geolocalization/GeonamesLocalization.java | 14 +- .../geolocalization/Localization.java | 2 +- .../OpenGeoDBLocalization.java | 14 +- .../OverarchingLocalization.java | 4 +- 9 files changed, 207 insertions(+), 186 deletions(-) delete mode 100644 source/net/yacy/document/geolocalization/Coordinates.java rename source/net/yacy/document/geolocalization/{Location.java => GeoLocation.java} (75%) create mode 100644 source/net/yacy/document/geolocalization/GeoPoint.java diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 6c41fef75..182030eea 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -55,7 +55,7 @@ import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.Parser; -import net.yacy.document.geolocalization.Location; +import net.yacy.document.geolocalization.GeoLocation; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.Word; @@ -909,12 +909,12 @@ public class yacysearch { } // find geographic info - final SortedSet coordinates = LibraryProvider.geoLoc.find(originalquerystring, false); + final SortedSet coordinates = LibraryProvider.geoLoc.find(originalquerystring, false); if ( coordinates == null || coordinates.isEmpty() || startRecord > 0 ) { prop.put("geoinfo", "0"); } else { int i = 0; - for ( final Location c : coordinates ) { + for ( final GeoLocation c : coordinates ) { prop.put("geoinfo_loc_" + i + "_lon", Math.round(c.lon() * 10000.0f) / 10000.0f); prop.put("geoinfo_loc_" + i + "_lat", Math.round(c.lat() * 10000.0f) / 10000.0f); prop.put("geoinfo_loc_" + i + "_name", c.getName()); diff --git a/htroot/yacysearch_location.java b/htroot/yacysearch_location.java index 27bc0618f..68b3ac080 100644 --- a/htroot/yacysearch_location.java +++ b/htroot/yacysearch_location.java @@ -28,7 +28,7 @@ import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.opensearch.SRURSSConnector; import net.yacy.document.LibraryProvider; -import net.yacy.document.geolocalization.Location; +import net.yacy.document.geolocalization.GeoLocation; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import de.anomic.server.serverCore; @@ -67,11 +67,11 @@ public class yacysearch_location { int placemarkCounter = 0; if (query.length() > 0 && search_query) { - final Set locations = LibraryProvider.geoLoc.find(query, true); + final Set locations = LibraryProvider.geoLoc.find(query, true); for (final String qp: query.split(" ")) { locations.addAll(LibraryProvider.geoLoc.find(qp, true)); } - for (final Location location: locations) { + for (final GeoLocation location: locations) { // write for all locations a point to this message prop.put("kml_placemark_" + placemarkCounter + "_location", location.getName()); prop.put("kml_placemark_" + placemarkCounter + "_name", location.getName()); diff --git a/source/net/yacy/document/geolocalization/Coordinates.java b/source/net/yacy/document/geolocalization/Coordinates.java deleted file mode 100644 index 06fddbd49..000000000 --- a/source/net/yacy/document/geolocalization/Coordinates.java +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Coordinates.java - * Copyright 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany - * first published 04.10.2009 on http://yacy.net - * - * This file is part of YaCy Content Integration - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . - */ - -package net.yacy.document.geolocalization; - -public class Coordinates { - - private static final double tenmeter = 90.0d / 1.0e6d; - - private final double lon, lat; - - public Coordinates(double lon, double lat) { - this.lon = lon; - this.lat = lat; - } - - public double lon() { - return this.lon; - } - - public double lat() { - return this.lat; - } - - private static final double bits30 = new Double(1L << 30).doubleValue(); // this is about one billion (US) - private static final double upscale = bits30 / 360.0; - - private static final int coord2int(double coord) { - return (int) ((180.0 - coord) * upscale); - } - - /** - * compute the hash code of a coordinate - * this produces identical hash codes for locations that are close to each other - */ - public int hashCode() { - return coord2int(this.lon) + (coord2int(this.lat) >> 15); - } - - /** - * equality test that is needed to use the class inside HashMap/HashSet - */ - public boolean equals(final Object o) { - if (!(o instanceof Coordinates)) return false; - Coordinates oo = (Coordinates) o; - if (this.lon == oo.lon && this.lat == oo.lat) return true; - // we access fuzzy values that are considered as equal if they are close to each other - return Math.abs(this.lon - oo.lon) < tenmeter && Math.abs(this.lat - oo.lat) < tenmeter; - } - - public String toString() { - return "[" + this.lon + "," + this.lat + "]"; - } -} diff --git a/source/net/yacy/document/geolocalization/Location.java b/source/net/yacy/document/geolocalization/GeoLocation.java similarity index 75% rename from source/net/yacy/document/geolocalization/Location.java rename to source/net/yacy/document/geolocalization/GeoLocation.java index 41a6e398c..94c91d73a 100644 --- a/source/net/yacy/document/geolocalization/Location.java +++ b/source/net/yacy/document/geolocalization/GeoLocation.java @@ -1,90 +1,93 @@ -/** - * Location.java - * Copyright 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany - * first published 08.10.2009 on http://yacy.net - * - * This file is part of YaCy Content Integration - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . - */ - -package net.yacy.document.geolocalization; - -import java.util.Comparator; - - -public class Location extends Coordinates implements Comparable, Comparator { - - private String name; - private int population; - - public Location(float lon, float lat) { - super(lon, lat); - this.name = null; - this.population = 0; - } - - public Location(float lon, float lat, String name) { - super(lon, lat); - this.name = name; - } - - public void setName(String name) { - this.name = name; - } - - public String getName() { - return this.name; - } - - public void setPopulation(int population) { - this.population = population; - } - - public int getPopulation() { - return this.population; - } - - public boolean equals(Object loc) { - if (!(loc instanceof Location)) return false; - if (this.name == null || ((Location) loc).name == null) return super.equals(loc); - return super.equals(loc) && this.name.toLowerCase().equals(((Location) loc).name.toLowerCase()); - } - - /** - * comparator that is needed to use the object inside TreeMap/TreeSet - * a Location is smaller than another if it has a _greater_ population - * this order is used to get sorted lists of locations where the first elements - * have the greatest population - */ - public int compareTo(Location o) { - if (this.equals(o)) return 0; - long s = (ph(this.getPopulation()) << 30) + this.hashCode(); - long t = (ph(o.getPopulation()) << 30) + o.hashCode(); - if (s > t) return -1; - if (s < t) return 1; - return 0; - } - - private long ph(int population) { - if (population > 10000) population -= 10000; - return (long) population; - } - - public int compare(Location o1, Location o2) { - return o1.compareTo(o2); - } - -} +/** + * GeoLocation + * Copyright 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany + * first published 08.10.2009 on http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.document.geolocalization; + +import java.util.Comparator; + + +public class GeoLocation extends GeoPoint implements Comparable, Comparator { + + private String name; + private int population; + + public GeoLocation(double lat, double lon) { + super(lat, lon); + this.name = null; + this.population = 0; + } + + public GeoLocation(double lat, double lon, String name) { + super(lat, lon); + this.name = name; + } + + public void setName(String name) { + this.name = name; + } + + public String getName() { + return this.name; + } + + public void setPopulation(int population) { + this.population = population; + } + + public int getPopulation() { + return this.population; + } + + @Override + public boolean equals(Object loc) { + if (!(loc instanceof GeoLocation)) return false; + if (this.name == null || ((GeoLocation) loc).name == null) return super.equals(loc); + return super.equals(loc) && this.name.toLowerCase().equals(((GeoLocation) loc).name.toLowerCase()); + } + + /** + * comparator that is needed to use the object inside TreeMap/TreeSet + * a Location is smaller than another if it has a _greater_ population + * this order is used to get sorted lists of locations where the first elements + * have the greatest population + */ + @Override + public int compareTo(GeoLocation o) { + if (this.equals(o)) return 0; + long s = (ph(this.getPopulation()) << 30) + this.hashCode(); + long t = (ph(o.getPopulation()) << 30) + o.hashCode(); + if (s > t) return -1; + if (s < t) return 1; + return 0; + } + + private long ph(int population) { + if (population > 10000) population -= 10000; + return population; + } + + @Override + public int compare(GeoLocation o1, GeoLocation o2) { + return o1.compareTo(o2); + } + +} diff --git a/source/net/yacy/document/geolocalization/GeoPoint.java b/source/net/yacy/document/geolocalization/GeoPoint.java new file mode 100644 index 000000000..c948c44d1 --- /dev/null +++ b/source/net/yacy/document/geolocalization/GeoPoint.java @@ -0,0 +1,91 @@ +/** + * GeoPoint + * Copyright 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany + * first published 08.10.2009 on http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.document.geolocalization; + + +public class GeoPoint { + + public static final double meter = 90.0d / 1.0e7d; // this is actually the definition of 'meter': 10 million meter shall be the distance from the equator to the pole + + private final long latlon; // using one variable for the coordinate pair saves some space + + public GeoPoint(double lat, double lon) { + this.latlon = (((long) coord2int(lat)) << 32) | (coord2int(lon)); + } + + + public GeoPoint(int lat, int lon) { + this.latlon = (((long) coord2int(lat / 1e6d)) << 32) | (coord2int(lon / 1e6d)); + } + + public double lon() { + return int2coord((int) (this.latlon & (Integer.MAX_VALUE))); + } + + public double lat() { + return int2coord((int) (this.latlon >>> 32)); + } + + private static final double maxint = new Double(Integer.MAX_VALUE).doubleValue(); + private static final double upscale = maxint / 360.0; + + private static final int coord2int(double coord) { + return (int) ((coord + 180.0) * upscale); + } + + private static final double int2coord(int z) { + return (z / upscale) - 180.0; + } + + /** + * compute the hash code of a coordinate + * this produces identical hash codes for locations that are close to each other + */ + @Override + public int hashCode() { + return (int) ((this.latlon & Integer.MAX_VALUE) >> 1) + (int) (this.latlon >> 33); + } + + /** + * equality test that is needed to use the class inside HashMap/HashSet + */ + @Override + public boolean equals(final Object o) { + if (!(o instanceof GeoPoint)) return false; + GeoPoint oo = (GeoPoint) o; + return (this.latlon == oo.latlon); + } + + @Override + public String toString() { + return "[" + this.lat() + "," + this.lon() + "]"; + } + + public static void main(String[] args) { + double lat = 13.419444d; + double lon = 52.548611d; + GeoPoint c = new GeoPoint(lat, lon); + System.out.println(c.toString() + " #" + c.hashCode()); + System.out.println("error: lat: " + (Math.abs(c.lat() - lat) / meter) + " meter; lon: " + (Math.abs(c.lon() - lon) / meter) + " meter"); + } +} diff --git a/source/net/yacy/document/geolocalization/GeonamesLocalization.java b/source/net/yacy/document/geolocalization/GeonamesLocalization.java index f23ea4f54..2f6225069 100644 --- a/source/net/yacy/document/geolocalization/GeonamesLocalization.java +++ b/source/net/yacy/document/geolocalization/GeonamesLocalization.java @@ -69,7 +69,7 @@ public class GeonamesLocalization implements Localization modification date : date of last modification in yyyy-MM-dd format */ - private final Map id2loc; + private final Map id2loc; private final TreeMap> name2ids; private final File file; @@ -77,7 +77,7 @@ public class GeonamesLocalization implements Localization // this is a processing of the cities1000.zip file from http://download.geonames.org/export/dump/ this.file = file; - this.id2loc = new HashMap(); + this.id2loc = new HashMap(); this.name2ids = new TreeMap>(StringBuilderComparator.CASE_INSENSITIVE_ORDER); @@ -112,8 +112,8 @@ public class GeonamesLocalization implements Localization for ( final String s : fields[3].split(",") ) { locnames.add(new StringBuilder(s)); } - final Location c = - new Location(Float.parseFloat(fields[5]), Float.parseFloat(fields[4]), fields[1]); + final GeoLocation c = + new GeoLocation(Float.parseFloat(fields[4]), Float.parseFloat(fields[5]), fields[1]); c.setPopulation((int) Long.parseLong(fields[14])); this.id2loc.put(id, c); for ( final StringBuilder name : locnames ) { @@ -136,7 +136,7 @@ public class GeonamesLocalization implements Localization } @Override - public TreeSet find(final String anyname, final boolean locationexact) { + public TreeSet find(final String anyname, final boolean locationexact) { final Set r = new HashSet(); List c; final StringBuilder an = new StringBuilder(anyname); @@ -155,9 +155,9 @@ public class GeonamesLocalization implements Localization } } } - final TreeSet a = new TreeSet(); + final TreeSet a = new TreeSet(); for ( final Integer e : r ) { - final Location w = this.id2loc.get(e); + final GeoLocation w = this.id2loc.get(e); if ( w != null ) { a.add(w); } diff --git a/source/net/yacy/document/geolocalization/Localization.java b/source/net/yacy/document/geolocalization/Localization.java index 3008c4d70..4b7b87add 100644 --- a/source/net/yacy/document/geolocalization/Localization.java +++ b/source/net/yacy/document/geolocalization/Localization.java @@ -45,7 +45,7 @@ public interface Localization { * @param locationexact - if true, then only exact matched with the location are returned. if false also partially matching names * @return a set of locations, ordered by population (if this information is given) */ - public TreeSet find(String anyname, boolean locationexact); + public TreeSet find(String anyname, boolean locationexact); /** * produce a set of location names diff --git a/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java b/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java index f0f891e8e..e02858a0e 100644 --- a/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java +++ b/source/net/yacy/document/geolocalization/OpenGeoDBLocalization.java @@ -53,7 +53,7 @@ public class OpenGeoDBLocalization implements Localization { private final Map locTypeHash2locType; - private final Map id2loc; + private final Map id2loc; private final Map id2locTypeHash; private final TreeMap> name2ids; private final TreeMap> kfz2ids; @@ -65,7 +65,7 @@ public class OpenGeoDBLocalization implements Localization this.file = file; this.locTypeHash2locType = new HashMap(); - this.id2loc = new HashMap(); + this.id2loc = new HashMap(); this.id2locTypeHash = new HashMap(); this.name2ids = new TreeMap>(StringBuilderComparator.CASE_INSENSITIVE_ORDER); @@ -112,7 +112,7 @@ public class OpenGeoDBLocalization implements Localization lat = Float.parseFloat(v[2]); lon = Float.parseFloat(v[3]); } - this.id2loc.put(Integer.parseInt(v[0]), new Location(lon, lat)); + this.id2loc.put(Integer.parseInt(v[0]), new GeoLocation(lat, lon)); } if ( line.startsWith("geodb_textdata ") ) { line = line.substring(15 + 7); @@ -126,7 +126,7 @@ public class OpenGeoDBLocalization implements Localization } l.add(id); this.name2ids.put(new StringBuilder(h), l); - final Location loc = this.id2loc.get(id); + final GeoLocation loc = this.id2loc.get(id); if ( loc != null ) { loc.setName(h); } @@ -200,7 +200,7 @@ public class OpenGeoDBLocalization implements Localization * @return */ @Override - public TreeSet find(final String anyname, final boolean locationexact) { + public TreeSet find(final String anyname, final boolean locationexact) { final HashSet r = new HashSet(); List c; final StringBuilder an = new StringBuilder(anyname); @@ -231,9 +231,9 @@ public class OpenGeoDBLocalization implements Localization r.add(i); } } - final TreeSet a = new TreeSet(); + final TreeSet a = new TreeSet(); for ( final Integer e : r ) { - final Location w = this.id2loc.get(e); + final GeoLocation w = this.id2loc.get(e); if ( w != null ) { a.add(w); } diff --git a/source/net/yacy/document/geolocalization/OverarchingLocalization.java b/source/net/yacy/document/geolocalization/OverarchingLocalization.java index 93a8183ba..26fded0ba 100644 --- a/source/net/yacy/document/geolocalization/OverarchingLocalization.java +++ b/source/net/yacy/document/geolocalization/OverarchingLocalization.java @@ -76,8 +76,8 @@ public class OverarchingLocalization implements Localization { * @return a set of locations, ordered by population (if this information is given) */ @Override - public TreeSet find(final String anyname, final boolean locationexact) { - final TreeSet locations = new TreeSet(); + public TreeSet find(final String anyname, final boolean locationexact) { + final TreeSet locations = new TreeSet(); for (final Localization service: this.services.values()) { locations.addAll(service.find(anyname, locationexact)); } From 6bb07afcc31df99f0868b7bf455f52f940fed95a Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 6 Jun 2012 13:36:10 +0200 Subject: [PATCH 2/7] accept also files with other file prefix; used to read 'foreign' cache files --- source/net/yacy/kelondro/blob/ArrayStack.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/source/net/yacy/kelondro/blob/ArrayStack.java b/source/net/yacy/kelondro/blob/ArrayStack.java index 6437b93e7..e5823db7a 100644 --- a/source/net/yacy/kelondro/blob/ArrayStack.java +++ b/source/net/yacy/kelondro/blob/ArrayStack.java @@ -128,7 +128,7 @@ public class ArrayStack implements BLOB { Runtime.getRuntime().availableProcessors(), 100, TimeUnit.MILLISECONDS, new LinkedBlockingQueue(), - new NamePrefixThreadFactory(prefix)); + new NamePrefixThreadFactory(this.prefix)); // check existence of the heap directory if (heapLocation.exists()) { @@ -183,9 +183,9 @@ public class ArrayStack implements BLOB { File f; long maxtime = 0; for (final String file : files) { - if (file.length() >= 22 && file.startsWith(prefix) && file.endsWith(".blob")) { + if (file.length() >= 22 && file.charAt(this.prefix.length()) == '.' && file.endsWith(".blob")) { try { - d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(prefix.length() + 1, prefix.length() + 18)); + d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18)); time = d.getTime(); if (time > maxtime) maxtime = time; } catch (final ParseException e) {continue;} @@ -194,9 +194,9 @@ public class ArrayStack implements BLOB { // open all blob files for (final String file : files) { - if (file.length() >= 22 && file.startsWith(prefix) && file.endsWith(".blob")) { + if (file.length() >= 22 && file.charAt(this.prefix.length()) == '.' && file.endsWith(".blob")) { try { - d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(prefix.length() + 1, prefix.length() + 18)); + d = my_SHORT_MILSEC_FORMATTER.parse(file.substring(this.prefix.length() + 1, this.prefix.length() + 18)); f = new File(heapLocation, file); time = d.getTime(); if (time == maxtime && !trimall) { From d0ec8018f5b6183c95f2ac3102be20f8a65a3056 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 6 Jun 2012 14:13:31 +0200 Subject: [PATCH 3/7] fixes for bad long computation --- htroot/PerformanceMemory_p.java | 12 ++-- htroot/PerformanceQueues_p.java | 84 ++++++++++++------------- htroot/ProxyIndexingMonitor_p.java | 13 ++-- htroot/YBRFetch_p.java | 2 +- source/net/yacy/search/Switchboard.java | 2 +- 5 files changed, 56 insertions(+), 57 deletions(-) diff --git a/htroot/PerformanceMemory_p.java b/htroot/PerformanceMemory_p.java index dccb39f51..4a1b5942f 100644 --- a/htroot/PerformanceMemory_p.java +++ b/htroot/PerformanceMemory_p.java @@ -124,7 +124,7 @@ public class PerformanceMemory_p { c++; } prop.put("EcoList", c); - prop.putNum("EcoIndexTotalMem", totalmem / (1024 * 1024d)); + prop.putNum("EcoIndexTotalMem", totalmem / (1024d * 1024d)); // write object cache table final Iterator> oi = RAMIndex.objects(); @@ -153,7 +153,7 @@ public class PerformanceMemory_p { c++; } prop.put("indexcache", c); - prop.putNum("indexcacheTotalMem", totalhitmem / (1024 * 1024d)); + prop.putNum("indexcacheTotalMem", totalhitmem / (1024d * 1024d)); // write object cache table i = Cache.filenames(); @@ -195,10 +195,10 @@ public class PerformanceMemory_p { c++; } prop.put("ObjectList", c); - prop.putNum("objectCacheStopGrow", Cache.getMemStopGrow() / (1024 * 1024d)); - prop.putNum("objectCacheStartShrink", Cache.getMemStartShrink() / (1024 * 1024d)); - prop.putNum("objectHitCacheTotalMem", totalhitmem / (1024 * 1024d)); - prop.putNum("objectMissCacheTotalMem", totalmissmem / (1024 * 1024d)); + prop.putNum("objectCacheStopGrow", Cache.getMemStopGrow() / (1024d * 1024d)); + prop.putNum("objectCacheStartShrink", Cache.getMemStartShrink() / (1024d * 1024d)); + prop.putNum("objectHitCacheTotalMem", totalhitmem / (1024d * 1024d)); + prop.putNum("objectMissCacheTotalMem", totalmissmem / (1024d * 1024d)); // other caching structures prop.putNum("namecacheHit.size", Domains.nameCacheHitSize()); diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index f8d2fe607..aa71f767b 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -55,13 +55,13 @@ public class PerformanceQueues_p { performanceProfiles.put("defaults/yacy.init", "default (crawl)"); performanceProfiles.put("defaults/performance_dht.profile", "prefer DHT"); } - + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); File defaultSettingsFile = new File(sb.getAppPath(), "defaults/yacy.init"); - + // get segment Segment indexSegment = null; if (post != null && post.containsKey("segment")) { @@ -73,7 +73,7 @@ public class PerformanceQueues_p { // take default segment indexSegment = sb.indexSegments.segment(Segments.Process.PUBLIC); } - + if(post != null) { if(post.containsKey("defaultFile")){ // TODO check file-path! @@ -108,10 +108,10 @@ public class PerformanceQueues_p { Iterator threads = sb.threadNames(); String threadName; BusyThread thread; - + final boolean xml = (header.get(HeaderFramework.CONNECTION_PROP_PATH)).endsWith(".xml"); prop.setLocalized(!xml); - + // calculate totals long blocktime_total = 0, sleeptime_total = 0, exectime_total = 0; while (threads.hasNext()) { @@ -120,11 +120,11 @@ public class PerformanceQueues_p { blocktime_total += thread.getBlockTime(); sleeptime_total += thread.getSleepTime(); exectime_total += thread.getExecTime(); - } + } if (blocktime_total == 0) blocktime_total = 1; if (sleeptime_total == 0) sleeptime_total = 1; if (exectime_total == 0) exectime_total = 1; - + // set templates for latest news from the threads long blocktime, sleeptime, exectime; long idlesleep, busysleep, memuse, memprereq; @@ -141,11 +141,11 @@ public class PerformanceQueues_p { sb.setConfig("performanceProfile", post.get("defaultFile", "defaults/yacy.init")); sb.setConfig("performanceSpeed", post.getInt("profileSpeed", 100)); } - + while (threads.hasNext()) { threadName = threads.next(); thread = sb.getThread(threadName); - + // set values to templates prop.put("table_" + c + "_threadname", threadName); @@ -159,7 +159,7 @@ public class PerformanceQueues_p { prop.putHTML("table_" + c + "_longdescr", thread.getLongDescription()); queuesize = thread.getJobCount(); prop.put("table_" + c + "_queuesize", (queuesize == Integer.MAX_VALUE) ? "unlimited" : Formatter.number(queuesize, !xml)); - + blocktime = thread.getBlockTime(); sleeptime = thread.getSleepTime(); exectime = thread.getExecTime(); @@ -180,7 +180,7 @@ public class PerformanceQueues_p { prop.putNum("table_" + c + "_sleeppercycle", ((idleCycles + busyCycles) == 0) ? -1 : sleeptime / (idleCycles + busyCycles)); prop.putNum("table_" + c + "_execpercycle", (busyCycles == 0) ? -1 : exectime / busyCycles); prop.putNum("table_" + c + "_memusepercycle", (busyCycles == 0) ? -1 : memuse / busyCycles / 1024); - + // load with old values idlesleep = sb.getConfigLong(threadName + "_idlesleep" , 1000); busysleep = sb.getConfigLong(threadName + "_busysleep", 100); @@ -189,13 +189,13 @@ public class PerformanceQueues_p { // load with new values idlesleep = post.getLong(threadName + "_idlesleep", idlesleep); busysleep = post.getLong(threadName + "_busysleep", busysleep); - memprereq = post.getLong(threadName + "_memprereq", memprereq) * 1024; + memprereq = post.getLong(threadName + "_memprereq", memprereq) * 1024l; if (memprereq == 0) memprereq = sb.getConfigLong(threadName + "_memprereq", 0); - + // check values to prevent short-cut loops if (idlesleep < 1000) idlesleep = 1000; if (threadName.equals("10_httpd")) { idlesleep = 0; busysleep = 0; memprereq = 0; } - + sb.setThreadPerformance(threadName, idlesleep, busysleep, memprereq); idlesleep = sb.getConfigLong(threadName + "_idlesleep", idlesleep); busysleep = sb.getConfigLong(threadName + "_busysleep", busysleep); @@ -228,7 +228,7 @@ public class PerformanceQueues_p { c++; } prop.put("table", c); - + // performance profiles c = 0; final String usedfile = sb.getConfig("performanceProfile", "defaults/yacy.init"); @@ -239,7 +239,7 @@ public class PerformanceQueues_p { c++; } prop.put("profile", c); - + c = 0; final int[] speedValues = {200,150,100,50,25,10}; final int usedspeed = sb.getConfigInt("performanceSpeed", 100); @@ -250,27 +250,27 @@ public class PerformanceQueues_p { c++; } prop.put("speed", c); - + if ((post != null) && (post.containsKey("cacheSizeSubmit"))) { final int wordCacheMaxCount = post.getInt("wordCacheMaxCount", 20000); sb.setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount)); indexSegment.termIndex().setBufferMaxWordCount(wordCacheMaxCount); } - + if ((post != null) && (post.containsKey("poolConfig"))) { - - /* - * configuring the crawler pool + + /* + * configuring the crawler pool */ // get the current crawler pool configuration int maxBusy = post.getInt("Crawler Pool_maxActive", 8); - + // storing the new values into configfile sb.setConfig(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX,maxBusy); //switchboard.setConfig("crawler.MinIdleThreads",minIdle); - - /* - * configuring the http pool + + /* + * configuring the http pool */ final WorkflowThread httpd = sb.getThread("10_httpd"); try { @@ -279,23 +279,23 @@ public class PerformanceQueues_p { maxBusy = 8; } - ((serverCore)httpd).setMaxSessionCount(maxBusy); - + ((serverCore)httpd).setMaxSessionCount(maxBusy); + // storing the new values into configfile sb.setConfig("httpdMaxBusySessions",maxBusy); - } - + } + if ((post != null) && (post.containsKey("PrioritySubmit"))) { sb.setConfig("javastart_priority",post.get("YaCyPriority","0")); } - + if ((post != null) && (post.containsKey("onlineCautionSubmit"))) { sb.setConfig(SwitchboardConstants.PROXY_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseProxy", 30000))); sb.setConfig(SwitchboardConstants.LOCALSEACH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseLocalsearch", 30000))); sb.setConfig(SwitchboardConstants.REMOTESEARCH_ONLINE_CAUTION_DELAY, Integer.toString(post.getInt("crawlPauseRemotesearch", 30000))); } - + if ((post != null) && (post.containsKey("minimumDeltaSubmit"))) { final long minimumLocalDelta = post.getLong("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta()); final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta()); @@ -303,13 +303,13 @@ public class PerformanceQueues_p { sb.setConfig("minimumGlobalDelta", minimumGlobalDelta); sb.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); } - + // delta settings prop.put("minimumLocalDelta", sb.crawlQueues.noticeURL.getMinimumLocalDelta()); prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta()); - + // table cache settings - prop.putNum("urlCacheSize", indexSegment.urlMetadata().writeCacheSize()); + prop.putNum("urlCacheSize", indexSegment.urlMetadata().writeCacheSize()); prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize()); prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024); prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences()); @@ -323,30 +323,30 @@ public class PerformanceQueues_p { prop.putNum("crawlPauseProxyCurrent", (System.currentTimeMillis() - sb.proxyLastAccess) / 1000); prop.putNum("crawlPauseLocalsearchCurrent", (System.currentTimeMillis() - sb.localSearchLastAccess) / 1000); prop.putNum("crawlPauseRemotesearchCurrent", (System.currentTimeMillis() - sb.remoteSearchLastAccess) / 1000); - + // table thread pool settings prop.put("pool_0_name","Crawler Pool"); prop.put("pool_0_maxActive", sb.getConfigLong("crawler.MaxActiveThreads", 0)); prop.put("pool_0_numActive",sb.crawlQueues.workerSize()); - + final WorkflowThread httpd = sb.getThread("10_httpd"); prop.put("pool_1_name", "httpd Session Pool"); prop.put("pool_1_maxActive", ((serverCore)httpd).getMaxSessionCount()); prop.put("pool_1_numActive", ((serverCore)httpd).getJobCount()); - + prop.put("pool", "2"); - + final long curr_prio = sb.getConfigLong("javastart_priority",0); prop.put("priority_normal",(curr_prio == 0) ? "1" : "0"); prop.put("priority_below",(curr_prio == 10) ? "1" : "0"); prop.put("priority_low",(curr_prio == 20) ? "1" : "0"); - + // parse initialization memory settings final String Xmx = sb.getConfig("javastart_Xmx", "Xmx500m").substring(3); prop.put("Xmx", Xmx.substring(0, Xmx.length() - 1)); final String Xms = sb.getConfig("javastart_Xms", "Xms500m").substring(3); prop.put("Xms", Xms.substring(0, Xms.length() - 1)); - + final long diskFree = sb.getConfigLong(SwitchboardConstants.DISK_FREE, 3000L); final long diskFreeHardlimit = sb.getConfigLong(SwitchboardConstants.DISK_FREE_HARDLIMIT, 1000L); final long memoryAcceptDHT = sb.getConfigLong(SwitchboardConstants.MEMORY_ACCEPTDHT, 50000L); @@ -355,11 +355,11 @@ public class PerformanceQueues_p { prop.put("diskFreeHardlimit", diskFreeHardlimit); prop.put("memoryAcceptDHT", memoryAcceptDHT); if(observerTrigger) prop.put("observerTrigger", "1"); - + // return rewrite values for templates return prop; } - + private static String d(final String a, final String b) { return (a == null) ? b : a; } diff --git a/htroot/ProxyIndexingMonitor_p.java b/htroot/ProxyIndexingMonitor_p.java index 98553fd23..6bef3fa91 100644 --- a/htroot/ProxyIndexingMonitor_p.java +++ b/htroot/ProxyIndexingMonitor_p.java @@ -1,4 +1,4 @@ -// ProxyIndexingMonitor_p.java +// ProxyIndexingMonitor_p.java // --------------------------- // part of the AnomicHTTPD caching proxy // (C) by Michael Peter Christen; mc@yacy.net @@ -33,7 +33,6 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; - import de.anomic.crawler.Cache; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -66,7 +65,7 @@ public class ProxyIndexingMonitor_p { final boolean proxyYaCyOnly = post.containsKey("proxyYacyOnly"); env.setConfig(SwitchboardConstants.PROXY_YACY_ONLY, (proxyYaCyOnly) ? true : false); int newProxyPrefetchDepth = post.getInt("proxyPrefetchDepth", 0); - if (newProxyPrefetchDepth < 0) newProxyPrefetchDepth = 0; + if (newProxyPrefetchDepth < 0) newProxyPrefetchDepth = 0; if (newProxyPrefetchDepth > 20) newProxyPrefetchDepth = 20; // self protection ? env.setConfig("proxyPrefetchDepth", Integer.toString(newProxyPrefetchDepth)); final boolean proxyStoreHTCache = post.containsKey("proxyStoreHTCache"); @@ -77,7 +76,7 @@ public class ProxyIndexingMonitor_p { env.setConfig("proxyIndexingLocalText", proxyIndexingLocalText ? true : false); final boolean proxyIndexingLocalMedia = post.containsKey("proxyIndexingLocalMedia"); env.setConfig("proxyIndexingLocalMedia", proxyIndexingLocalMedia ? true : false); - + // added proxyCache, proxyCacheSize - Borg-0300 // proxyCache - check and create the directory oldProxyCachePath = env.getConfig(SwitchboardConstants.HTCACHE_PATH, SwitchboardConstants.HTCACHE_PATH_DEFAULT); @@ -90,12 +89,12 @@ public class ProxyIndexingMonitor_p { final File cache = env.getDataPath(SwitchboardConstants.HTCACHE_PATH, oldProxyCachePath); if (!cache.isDirectory() && !cache.isFile()) cache.mkdirs(); - // proxyCacheSize + // proxyCacheSize oldProxyCacheSize = env.getConfigLong(SwitchboardConstants.PROXY_CACHE_SIZE, 64L); newProxyCacheSize = post.getLong(SwitchboardConstants.PROXY_CACHE_SIZE, 64L); if (newProxyCacheSize < 4) { newProxyCacheSize = 4; } env.setConfig(SwitchboardConstants.PROXY_CACHE_SIZE, newProxyCacheSize); - Cache.setMaxCacheSize(newProxyCacheSize * 1024 * 1024); + Cache.setMaxCacheSize(newProxyCacheSize * 1024L * 1024L); // implant these settings also into the crawling profile for the proxy if (sb.crawler.defaultProxyProfile == null) { @@ -108,7 +107,7 @@ public class ProxyIndexingMonitor_p { sb.crawler.defaultProxyProfile.put("indexText", proxyIndexingLocalText); sb.crawler.defaultProxyProfile.put("indexMedia", proxyIndexingLocalMedia); sb.crawler.putActive(sb.crawler.defaultProxyProfile.handle().getBytes(), sb.crawler.defaultProxyProfile); - + prop.put("info", "2");//new proxyPrefetchdepth prop.put("info_message", newProxyPrefetchDepth); prop.put("info_caching", proxyStoreHTCache ? "1" : "0"); diff --git a/htroot/YBRFetch_p.java b/htroot/YBRFetch_p.java index 4f97ea207..3dc7625b3 100644 --- a/htroot/YBRFetch_p.java +++ b/htroot/YBRFetch_p.java @@ -28,7 +28,7 @@ public class YBRFetch_p final servletProperties prop = new servletProperties(); final Switchboard sb = (Switchboard) env; - if ( post == null || !post.containsKey("ghrt4") || MemoryControl.available() < 1024 * 1024 * 1024 ) { + if ( post == null || !post.containsKey("ghrt4") || MemoryControl.available() < 1024L * 1024L * 1024L ) { return prop; } final File hostIndexFile = new File(sb.queuesRoot, "hostIndex.blob"); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 4f1957e9a..2b3bad5ab 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -556,7 +556,7 @@ public final class Switchboard extends serverSwitch getDataPath(SwitchboardConstants.HTCACHE_PATH, SwitchboardConstants.HTCACHE_PATH_DEFAULT); this.log.logInfo("HTCACHE Path = " + this.htCachePath.getAbsolutePath()); final long maxCacheSize = - 1024 * 1024 * Long.parseLong(getConfig(SwitchboardConstants.PROXY_CACHE_SIZE, "2")); // this is megabyte + 1024L * 1024L * Long.parseLong(getConfig(SwitchboardConstants.PROXY_CACHE_SIZE, "2")); // this is megabyte Cache.init(this.htCachePath, this.peers.mySeed().hash, maxCacheSize); // create the surrogates directories From 3dd83768250572a6f42f7857adb0ef607e9c0223 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 6 Jun 2012 14:15:24 +0200 Subject: [PATCH 4/7] added automatic cleaning of cache if metadata and file database size is not equal. It might happen that these data is different because one of that caches is cleaned after a while or when it is too big. The metadata is then not cleaned, but now wiped after a checkup process at every application start. This should cause a bit less memory usage. --- htroot/ConfigHTCache_p.java | 9 +- source/de/anomic/crawler/Cache.java | 85 ++++++++++++++----- source/net/yacy/kelondro/blob/Compressor.java | 34 +++++++- source/net/yacy/kelondro/blob/MapHeap.java | 32 ++++++- 4 files changed, 129 insertions(+), 31 deletions(-) diff --git a/htroot/ConfigHTCache_p.java b/htroot/ConfigHTCache_p.java index b76087172..25ef37163 100644 --- a/htroot/ConfigHTCache_p.java +++ b/htroot/ConfigHTCache_p.java @@ -1,4 +1,4 @@ -// ConfigHTCache_p.java +// ConfigHTCache_p.java // --------------------------- // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de @@ -32,7 +32,6 @@ import java.io.IOException; import net.yacy.cora.protocol.RequestHeader; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; - import de.anomic.crawler.Cache; import de.anomic.data.WorkTables; import de.anomic.server.serverObjects; @@ -59,12 +58,12 @@ public class ConfigHTCache_p { cache.mkdirs(); } - // proxyCacheSize + // proxyCacheSize final int newProxyCacheSize = Math.max(post.getInt("maxCacheSize", 64), 4); env.setConfig(SwitchboardConstants.PROXY_CACHE_SIZE, newProxyCacheSize); - Cache.setMaxCacheSize(newProxyCacheSize * 1024 * 1024); + Cache.setMaxCacheSize(newProxyCacheSize * 1024L * 1024L); } - + if (post != null && post.containsKey("deletecomplete")) { if ("on".equals(post.get("deleteCache", ""))) { Cache.clear(); diff --git a/source/de/anomic/crawler/Cache.java b/source/de/anomic/crawler/Cache.java index 01e5fcc2f..f1867a7fa 100644 --- a/source/de/anomic/crawler/Cache.java +++ b/source/de/anomic/crawler/Cache.java @@ -39,6 +39,7 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Map; +import java.util.concurrent.BlockingQueue; import net.yacy.cora.document.ASCII; import net.yacy.cora.protocol.ResponseHeader; @@ -47,6 +48,7 @@ import net.yacy.kelondro.blob.Compressor; import net.yacy.kelondro.blob.MapHeap; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; +import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; @@ -57,7 +59,7 @@ public final class Cache { private static final String RESPONSE_HEADER_DB_NAME = "responseHeader.heap"; private static final String FILE_DB_NAME = "file.array"; - private static Map> responseHeaderDB = null; + private static MapHeap responseHeaderDB = null; private static Compressor fileDB = null; private static ArrayStack fileDBunbuffered = null; @@ -84,6 +86,7 @@ public final class Cache { } catch (final IOException e) { Log.logException(e); } + // open the cache file try { fileDBunbuffered = new ArrayStack(new File(cachePath, FILE_DB_NAME), prefix, Base64Order.enhancedCoder, 12, 1024 * 1024 * 2, false); fileDBunbuffered.setMaxSize(maxCacheSize); @@ -91,6 +94,53 @@ public final class Cache { } catch (final IOException e) { Log.logException(e); } + Log.logInfo("Cache", "initialized cache database responseHeaderDB.size() = " + responseHeaderDB.size() + ", fileDB.size() = " + fileDB.size()); + + // clean up the responseHeaderDB which cannot be cleaned the same way as the cache files. + // We do this as a concurrent job only once after start-up silently + if (responseHeaderDB.size() != fileDB.size()) { + Log.logWarning("Cache", "file and metadata size is not equal, starting a cleanup thread..."); + Thread startupCleanup = new Thread() { + @Override + public void run() { + // enumerate the responseHeaderDB and find out all entries that are not inside the fileDBunbuffered + BlockingQueue q = responseHeaderDB.keyQueue(1000); + final HandleSet delkeys = new HandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 1); + Log.logInfo("Cache", "started cleanup thread to remove unused cache metadata"); + try { + byte[] k; + while (((k = q.take()) != MapHeap.POISON_QUEUE_ENTRY)) { + if (!fileDB.containsKey(k)) try { delkeys.put(k); } catch (RowSpaceExceededException e) { break; } + } + } catch (InterruptedException e) { + } finally { + // delete the collected keys from the metadata + Log.logInfo("Cache", "cleanup thread collected " + delkeys.size() + " unused metadata entries; now deleting them from the file..."); + for (byte[] k: delkeys) { + try { + responseHeaderDB.delete(k); + } catch (IOException e) { + } + } + } + + Log.logInfo("Cache", "running check to remove unused file cache data"); + delkeys.clear(); + for (byte[] k: fileDB) { + if (!responseHeaderDB.containsKey(k)) try { delkeys.put(k); } catch (RowSpaceExceededException e) { break; } + } + Log.logInfo("Cache", "cleanup thread collected " + delkeys.size() + " unused cache entries; now deleting them from the file..."); + for (byte[] k: delkeys) { + try { + fileDB.delete(k); + } catch (IOException e) { + } + } + Log.logInfo("Cache", "terminated cleanup thread; responseHeaderDB.size() = " + responseHeaderDB.size() + ", fileDB.size() = " + fileDB.size()); + } + }; + startupCleanup.start(); + } } /** @@ -131,9 +181,7 @@ public final class Cache { * close the databases */ public static void close() { - if (responseHeaderDB instanceof MapHeap) { - ((MapHeap) responseHeaderDB).close(); - } + responseHeaderDB.close(); fileDB.close(true); } @@ -156,12 +204,9 @@ public final class Cache { hm.putAll(responseHeader); hm.put("@@URL", url.toNormalform(true, false)); try { - if (responseHeaderDB instanceof MapHeap) { - ((MapHeap) responseHeaderDB).insert(url.hash(), hm); - } else { - responseHeaderDB.put(url.hash(), hm); - } + responseHeaderDB.insert(url.hash(), hm); } catch (final Exception e) { + fileDB.delete(url.hash()); throw new IOException("Cache.store: cannot write to headerDB: " + e.getMessage()); } if (log.isFine()) log.logFine("stored in cache: " + url.toNormalform(true, false)); @@ -184,11 +229,7 @@ public final class Cache { // if not both is there then we do a clean-up if (headerExists) try { log.logWarning("header but not content of urlhash " + ASCII.String(urlhash) + " in cache; cleaned up"); - if (responseHeaderDB instanceof MapHeap) { - ((MapHeap) responseHeaderDB).delete(urlhash); - } else { - responseHeaderDB.remove(urlhash); - } + responseHeaderDB.delete(urlhash); } catch (final IOException e) {} if (fileExists) try { //log.logWarning("content but not header of url " + url.toString() + " in cache; cleaned up"); @@ -209,8 +250,14 @@ public final class Cache { public static ResponseHeader getResponseHeader(final byte[] hash) { // loading data from database - Map hdb; - hdb = responseHeaderDB.get(hash); + Map hdb = null; + try { + hdb = responseHeaderDB.get(hash); + } catch (IOException e) { + return null; + } catch (RowSpaceExceededException e) { + return null; + } if (hdb == null) return null; return new ResponseHeader(null, hdb); @@ -251,11 +298,7 @@ public final class Cache { * @throws IOException */ public static void delete(final byte[] hash) throws IOException { - if (responseHeaderDB instanceof MapHeap) { - ((MapHeap) responseHeaderDB).delete(hash); - } else { - responseHeaderDB.remove(hash); - } + responseHeaderDB.delete(hash); fileDB.delete(hash); } } diff --git a/source/net/yacy/kelondro/blob/Compressor.java b/source/net/yacy/kelondro/blob/Compressor.java index 96c5bfc49..5b9837cdb 100644 --- a/source/net/yacy/kelondro/blob/Compressor.java +++ b/source/net/yacy/kelondro/blob/Compressor.java @@ -32,6 +32,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.util.Iterator; import java.util.Map; import java.util.TreeMap; import java.util.zip.GZIPInputStream; @@ -45,7 +46,7 @@ import net.yacy.kelondro.util.ByteArray; import net.yacy.kelondro.util.MemoryControl; -public class Compressor implements BLOB { +public class Compressor implements BLOB, Iterable { static byte[] gzipMagic = {(byte) 'z', (byte) '|'}; // magic for gzip-encoded content static byte[] plainMagic = {(byte) 'p', (byte) '|'}; // magic for plain content (no encoding) @@ -61,18 +62,22 @@ public class Compressor implements BLOB { initBuffer(); } + @Override public long mem() { return this.backend.mem(); } + @Override public void trim() { this.backend.trim(); } + @Override public String name() { return this.backend.name(); } + @Override public synchronized void clear() throws IOException { initBuffer(); this.backend.clear(); @@ -83,10 +88,12 @@ public class Compressor implements BLOB { this.bufferlength = 0; } + @Override public ByteOrder ordering() { return this.backend.ordering(); } + @Override public synchronized void close(final boolean writeIDX) { // no more thread is running, flush all queues flushAll(); @@ -164,6 +171,7 @@ public class Compressor implements BLOB { } } + @Override public byte[] get(final byte[] key) throws IOException, RowSpaceExceededException { // depending on the source of the result, we additionally do entry compression // because if a document was read once, we think that it will not be retrieved another time again soon @@ -186,6 +194,7 @@ public class Compressor implements BLOB { return decompress(b); } + @Override public byte[] get(final Object key) { if (!(key instanceof byte[])) return null; try { @@ -198,16 +207,19 @@ public class Compressor implements BLOB { return null; } + @Override public boolean containsKey(final byte[] key) { synchronized (this) { return this.buffer.containsKey(key) || this.backend.containsKey(key); } } + @Override public int keylength() { return this.backend.keylength(); } + @Override public synchronized long length() { try { return this.backend.length() + this.bufferlength; @@ -217,6 +229,7 @@ public class Compressor implements BLOB { } } + @Override public long length(final byte[] key) throws IOException { synchronized (this) { byte[] b = this.buffer.get(key); @@ -238,6 +251,7 @@ public class Compressor implements BLOB { return 0; } + @Override public void insert(final byte[] key, final byte[] b) throws IOException { // first ensure that the files do not exist anywhere @@ -265,32 +279,47 @@ public class Compressor implements BLOB { if (MemoryControl.shortStatus()) flushAll(); } + @Override public synchronized void delete(final byte[] key) throws IOException { this.backend.delete(key); final long rx = removeFromQueues(key); if (rx > 0) this.bufferlength -= rx; } + @Override public synchronized int size() { return this.backend.size() + this.buffer.size(); } + @Override public synchronized boolean isEmpty() { if (!this.backend.isEmpty()) return false; if (!this.buffer.isEmpty()) return false; return true; } + @Override public synchronized CloneableIterator keys(final boolean up, final boolean rotating) throws IOException { flushAll(); return this.backend.keys(up, rotating); } + @Override public synchronized CloneableIterator keys(final boolean up, final byte[] firstKey) throws IOException { flushAll(); return this.backend.keys(up, firstKey); } + @Override + public Iterator iterator() { + flushAll(); + try { + return this.backend.keys(true, false); + } catch (IOException e) { + return null; + } + } + private boolean flushOne() { if (this.buffer.isEmpty()) return false; // depending on process case, write it to the file or compress it to the other queue @@ -312,6 +341,7 @@ public class Compressor implements BLOB { } } + @Override public int replace(final byte[] key, final Rewriter rewriter) throws IOException, RowSpaceExceededException { final byte[] b = get(key); if (b == null) return 0; @@ -323,6 +353,7 @@ public class Compressor implements BLOB { return reduction; } + @Override public int reduce(final byte[] key, final Reducer reducer) throws IOException, RowSpaceExceededException { final byte[] b = get(key); if (b == null) return 0; @@ -334,4 +365,5 @@ public class Compressor implements BLOB { return reduction; } + } diff --git a/source/net/yacy/kelondro/blob/MapHeap.java b/source/net/yacy/kelondro/blob/MapHeap.java index 5d5049f7f..6300a643d 100644 --- a/source/net/yacy/kelondro/blob/MapHeap.java +++ b/source/net/yacy/kelondro/blob/MapHeap.java @@ -39,6 +39,8 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.TreeSet; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.date.GenericFormatter; @@ -57,8 +59,8 @@ import net.yacy.kelondro.util.MemoryControl; public class MapHeap implements Map> { - private BLOB blob; - private ARC> cache; + private final BLOB blob; + private final ARC> cache; private final char fillchar; @@ -445,11 +447,10 @@ public class MapHeap implements Map> { * close the Map table */ public synchronized void close() { - this.cache = null; + this.cache.clear(); // close file if (this.blob != null) this.blob.close(true); - this.blob = null; } @Override @@ -516,6 +517,29 @@ public class MapHeap implements Map> { return set; } + public final static byte[] POISON_QUEUE_ENTRY = "POISON".getBytes(); + public BlockingQueue keyQueue(final int size) { + final ArrayBlockingQueue set = new ArrayBlockingQueue(size); + (new Thread() { + @Override + public void run() { + try { + final Iterator i = MapHeap.this.blob.keys(true, false); + while (i.hasNext()) + try { + set.put(i.next()); + } catch (InterruptedException e) { + break; + } + } catch (final IOException e) {} + try { + set.put(MapHeap.POISON_QUEUE_ENTRY); + } catch (InterruptedException e) { + } + }}).start(); + return set; + } + @Override public Collection> values() { // this method shall not be used because it is not appropriate for this kind of data From 8002fd2578a3258f7e64a402343b1af441bbd368 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 6 Jun 2012 14:17:42 +0200 Subject: [PATCH 5/7] use less cache space since a large cache would cause more memory usage in index files. --- defaults/yacy.init | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 81c1a1cae..6bb101e7c 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -245,8 +245,8 @@ sessionidNamesFile = defaults/sessionid.names proxyCache = DATA/HTCACHE # the maximum disc cache size for files in Cache in megabytes -# default: 32 Gigabyte -proxyCacheSize = 32768 +# default: 4 Gigabyte +proxyCacheSize = 4096 # a path to the surrogate input directory surrogates.in = DATA/SURROGATES/in From d7eb18cdf2abec26efac4d116284ae07dd797430 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 6 Jun 2012 14:27:18 +0200 Subject: [PATCH 6/7] accept also file names beginning with "file://" for crawl start from file. --- htroot/CrawlStartExpert_p.html | 2 +- htroot/Crawler_p.java | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index 11a82e02b..0cadd1db8 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -69,7 +69,7 @@ - : + : diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 75a28a323..b0a0c983a 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -154,8 +154,14 @@ public class Crawler_p { } // remove crawlingFileContent before we record the call - final String crawlingFileName = post.get("crawlingFile"); - final File crawlingFile = (crawlingFileName != null && crawlingFileName.length() > 0) ? new File(crawlingFileName) : null; + String crawlingFileName = post.get("crawlingFile"); + final File crawlingFile; + if (crawlingFileName == null || crawlingFileName.length() == 0) { + crawlingFile = null; + } else { + if (crawlingFileName.startsWith("file://")) crawlingFileName = crawlingFileName.substring(7); + crawlingFile = new File(crawlingFileName); + } if (crawlingFile != null && crawlingFile.exists()) { post.remove("crawlingFile$file"); } @@ -644,7 +650,7 @@ public class Crawler_p { prop.put("crawlProfilesShow_list", count); prop.put("crawlProfilesShow", count == 0 ? 0 : 1); - + // return rewrite properties return prop; } From 00f2df1120b7c2fa96f1e838525d1d3cbb917748 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 6 Jun 2012 18:23:18 +0200 Subject: [PATCH 7/7] a variety of possible memory leak fixes --- source/net/yacy/cora/protocol/Domains.java | 1 + .../sorting/WeakPriorityBlockingQueue.java | 71 ++++++++++--------- source/net/yacy/cora/storage/HashARC.java | 4 +- source/net/yacy/cora/storage/KeyList.java | 4 ++ source/net/yacy/kelondro/index/HandleMap.java | 5 +- .../kelondro/rwi/ReferenceContainerArray.java | 1 + source/net/yacy/kelondro/table/Table.java | 3 + .../net/yacy/search/query/SnippetProcess.java | 4 +- .../net/yacy/search/snippet/TextSnippet.java | 4 +- 9 files changed, 58 insertions(+), 39 deletions(-) diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index 94c4ee48e..0d8e28374 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -453,6 +453,7 @@ public class Domains { globalHosts = null; } else try { globalHosts = new KeyList(globalHostsnameCache); + Log.logInfo("Domains", "loaded globalHosts cache of hostnames, size = " + globalHosts.size()); } catch (final IOException e) { globalHosts = null; } diff --git a/source/net/yacy/cora/sorting/WeakPriorityBlockingQueue.java b/source/net/yacy/cora/sorting/WeakPriorityBlockingQueue.java index 879e6e5ed..a240c3238 100644 --- a/source/net/yacy/cora/sorting/WeakPriorityBlockingQueue.java +++ b/source/net/yacy/cora/sorting/WeakPriorityBlockingQueue.java @@ -12,12 +12,12 @@ * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. - * + * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . @@ -41,7 +41,7 @@ import java.util.concurrent.TimeUnit; */ public class WeakPriorityBlockingQueue { - + private final TreeSet> queue; // object within the stack, ordered using a TreeSet private final Semaphore enqueued; // semaphore for elements in the stack private final ArrayList> drained; // objects that had been on the stack but had been removed @@ -70,7 +70,7 @@ public class WeakPriorityBlockingQueue { this.queue.clear(); this.enqueued.drainPermits(); } - + /** * test if the queue is empty * @return true if the queue is empty, false if not @@ -78,7 +78,7 @@ public class WeakPriorityBlockingQueue { public boolean isEmpty() { return this.queue.isEmpty() & this.drained.isEmpty(); } - + /** * get the number of elements in the queue, waiting to be removed with take() or poll() * @return @@ -89,7 +89,7 @@ public class WeakPriorityBlockingQueue { /** - * get the number of elements that had been drained so far and are wainting + * get the number of elements that had been drained so far and are waiting * in a list to get enumerated with element() * @return */ @@ -103,9 +103,9 @@ public class WeakPriorityBlockingQueue { * @return */ public synchronized int sizeAvailable() { - return this.queue.size() + this.drained.size(); + return Math.min(this.maxsize, this.queue.size() + this.drained.size()); } - + /** * put a element on the stack using a order of the weight * elements that had been on the stack cannot be put in again, @@ -126,7 +126,7 @@ public class WeakPriorityBlockingQueue { } assert this.queue.size() >= this.enqueued.availablePermits() : "(put) queue.size() = " + this.queue.size() + ", enqueued.availablePermits() = " + this.enqueued.availablePermits(); } - + /** * return the element with the smallest weight and remove it from the stack * @return null if no element is on the queue or the head of the queue @@ -138,7 +138,7 @@ public class WeakPriorityBlockingQueue { return takeUnsafe(); } } - + /** * Retrieves and removes the head of this queue, waiting if necessary * up to the specified wait time if no elements are present on this queue. @@ -153,7 +153,7 @@ public class WeakPriorityBlockingQueue { return takeUnsafe(); } } - + /** * Retrieves and removes the head of this queue, waiting if no elements are present on this queue. * @return the head element from the queue @@ -165,17 +165,17 @@ public class WeakPriorityBlockingQueue { return takeUnsafe(); } } - + private Element takeUnsafe() { final Element element = this.queue.first(); assert element != null; this.queue.remove(element); - this.drained.add(element); + if (this.drained.size() < this.maxsize) this.drained.add(element); assert this.queue.size() >= this.enqueued.availablePermits() : "(take) queue.size() = " + this.queue.size() + ", enqueued.availablePermits() = " + this.enqueued.availablePermits(); return element; } - + /** * return the element with the smallest weight, but do not remove it * @return null if no element is on the queue or the head of the queue @@ -184,7 +184,7 @@ public class WeakPriorityBlockingQueue { if (this.queue.isEmpty()) return null; return this.queue.first(); } - + /** * all objects that have been returned by poll or take are stored in a back-up list * where they can be retrieved afterward. The elements from that list are stored in @@ -214,7 +214,7 @@ public class WeakPriorityBlockingQueue { return this.drained.get(position); } } - + /** * retrieve an element from the drained queue but wait until a timeout * until returning null when no element will be available within the time @@ -237,7 +237,7 @@ public class WeakPriorityBlockingQueue { if (position >= this.drained.size()) return null; // we still don't have that element return this.drained.get(position); } - + /** * return the specific amount of entries as they would be retrievable with element() * if count is < 0 then all elements are taken @@ -249,11 +249,11 @@ public class WeakPriorityBlockingQueue { if (count < 0) { return list(); } - if (count > sizeAvailable()) throw new RuntimeException("list(" + count + ") exceeded avaiable number of elements (" + sizeAvailable() + ")"); + if (count > sizeAvailable()) throw new RuntimeException("list(" + count + ") exceeded avaiable number of elements (" + sizeAvailable() + ")"); while (count > this.drained.size()) this.poll(); return this.drained; } - + /** * return all entries as they would be retrievable with element() * @return a list of all elements in the stack @@ -263,7 +263,7 @@ public class WeakPriorityBlockingQueue { while (!this.queue.isEmpty()) this.poll(); return this.drained; } - + /** * iterate over all elements available. All elements that are still in the queue are drained to recorded positions * @return an iterator over all drained positions. @@ -283,20 +283,23 @@ public class WeakPriorityBlockingQueue { @Override public String toString(); } - + protected abstract static class AbstractElement implements Element { public long weight; public E element; - + + @Override public long getWeight() { return this.weight; } - + + @Override public E getElement() { return this.element; } - + + @Override public boolean equals(Element o) { return this.element.equals(o.getElement()); } @@ -305,13 +308,13 @@ public class WeakPriorityBlockingQueue { public int hashCode() { return this.element.hashCode(); } - + @Override public String toString() { - return element.toString() + "/" + weight; + return this.element.toString() + "/" + this.weight; } } - + /** * natural ordering elements, can be used as container of objects in the priority queue * the elements with smallest ordering weights are first in the queue when elements are taken @@ -323,10 +326,12 @@ public class WeakPriorityBlockingQueue { this.weight = weight; } + @Override public int compare(NaturalElement o1, NaturalElement o2) { return o1.compareTo(o2); } - + + @Override public int compareTo(NaturalElement o) { if (this.element == o.getElement()) return 0; if (this.element.equals(o.getElement())) return 0; @@ -338,9 +343,9 @@ public class WeakPriorityBlockingQueue { if (o1h < o2h) return -1; return 0; } - + } - + /** * reverse ordering elements, can be used as container of objects in the priority queue * the elements with highest ordering weights are first in the queue when elements are taken @@ -352,10 +357,12 @@ public class WeakPriorityBlockingQueue { this.weight = weight; } + @Override public int compare(ReverseElement o1, ReverseElement o2) { return o1.compareTo(o2); } - + + @Override public int compareTo(ReverseElement o) { if (this.element == o.getElement()) return 0; if (this.element.equals(o.getElement())) return 0; @@ -368,7 +375,7 @@ public class WeakPriorityBlockingQueue { return 0; } } - + public static void main(String[] args) { final WeakPriorityBlockingQueue a = new WeakPriorityBlockingQueue(3); //final Element REVERSE_POISON = new ReverseElement("", Long.MIN_VALUE); diff --git a/source/net/yacy/cora/storage/HashARC.java b/source/net/yacy/cora/storage/HashARC.java index b67a95d33..3a9f54dd6 100644 --- a/source/net/yacy/cora/storage/HashARC.java +++ b/source/net/yacy/cora/storage/HashARC.java @@ -37,13 +37,13 @@ public final class HashARC extends SimpleARC implements Map, I public HashARC(final int cacheSize) { this.cacheSize = cacheSize / 2; - super.levelA = Collections.synchronizedMap(new LinkedHashMap(cacheSize, 0.1f, accessOrder) { + super.levelA = Collections.synchronizedMap(new LinkedHashMap(1, 0.1f, accessOrder) { private static final long serialVersionUID = 1L; @Override protected boolean removeEldestEntry(final Map.Entry eldest) { return size() > HashARC.this.cacheSize; } }); - this.levelB = Collections.synchronizedMap(new LinkedHashMap(cacheSize, 0.1f, accessOrder) { + this.levelB = Collections.synchronizedMap(new LinkedHashMap(1, 0.1f, accessOrder) { private static final long serialVersionUID = 1L; @Override protected boolean removeEldestEntry(final Map.Entry eldest) { return size() > HashARC.this.cacheSize; diff --git a/source/net/yacy/cora/storage/KeyList.java b/source/net/yacy/cora/storage/KeyList.java index 9fd0b6588..3697403ad 100644 --- a/source/net/yacy/cora/storage/KeyList.java +++ b/source/net/yacy/cora/storage/KeyList.java @@ -76,6 +76,10 @@ public class KeyList implements Iterable { } + public int size() { + return this.keys.size(); + } + public boolean contains(final String key) { return this.keys.containsKey(key.trim().toLowerCase()); } diff --git a/source/net/yacy/kelondro/index/HandleMap.java b/source/net/yacy/kelondro/index/HandleMap.java index 42590aaee..0747e415c 100644 --- a/source/net/yacy/kelondro/index/HandleMap.java +++ b/source/net/yacy/kelondro/index/HandleMap.java @@ -98,6 +98,7 @@ public final class HandleMap implements Iterable { is.close(); is = null; assert this.index.size() == file.length() / (keylength + idxbytes); + trim(); } public void trim() { @@ -415,6 +416,7 @@ public final class HandleMap implements Iterable { return this.result.get(); } + @Override public final HandleMap call() throws IOException { try { finishloop: while (true) { @@ -439,7 +441,8 @@ public final class HandleMap implements Iterable { } } - public Iterator iterator() { + @Override + public Iterator iterator() { return rows(true, null); } } diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainerArray.java b/source/net/yacy/kelondro/rwi/ReferenceContainerArray.java index 829db45b1..bda9669f3 100644 --- a/source/net/yacy/kelondro/rwi/ReferenceContainerArray.java +++ b/source/net/yacy/kelondro/rwi/ReferenceContainerArray.java @@ -473,6 +473,7 @@ public final class ReferenceContainerArray { } } } + references.trim(); System.out.println("CELL REFERENCE COLLECTION finished"); return references; } diff --git a/source/net/yacy/kelondro/table/Table.java b/source/net/yacy/kelondro/table/Table.java index 3e0e94999..b2483be11 100644 --- a/source/net/yacy/kelondro/table/Table.java +++ b/source/net/yacy/kelondro/table/Table.java @@ -187,6 +187,7 @@ public class Table implements Index, Iterable { } } } + this.index.trim(); // open the file this.file = new BufferedRecords(new Records(tablefile, rowdef.objectsize), this.buffersize); @@ -594,6 +595,7 @@ public class Table implements Index, Iterable { * @throws IOException * @throws RowSpaceExceededException */ + @Override public boolean put(final Entry row) throws IOException, RowSpaceExceededException { assert row != null; if (this.file == null || row == null) return true; @@ -702,6 +704,7 @@ public class Table implements Index, Iterable { } } + @Override public boolean delete(final byte[] key) throws IOException { return remove(key) != null; } diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java index 6a6188210..ed8151973 100644 --- a/source/net/yacy/search/query/SnippetProcess.java +++ b/source/net/yacy/search/query/SnippetProcess.java @@ -109,8 +109,8 @@ public class SnippetProcess { this.urlRetrievalAllTime = 0; this.snippetComputationAllTime = 0; - this.result = new WeakPriorityBlockingQueue(-1); // this is the result, enriched with snippets, ranked and ordered by ranking - this.images = new WeakPriorityBlockingQueue(-1); + this.result = new WeakPriorityBlockingQueue(Math.max(1000, 10 * query.itemsPerPage())); // this is the result, enriched with snippets, ranked and ordered by ranking + this.images = new WeakPriorityBlockingQueue(Math.max(1000, 10 * query.itemsPerPage())); // snippets do not need to match with the complete query hashes, // only with the query minus the stopwords which had not been used for the search diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java index 60133bed2..60da24613 100644 --- a/source/net/yacy/search/snippet/TextSnippet.java +++ b/source/net/yacy/search/snippet/TextSnippet.java @@ -206,10 +206,10 @@ public class TextSnippet implements Comparable, Comparator