performance hacks: more pre-allocated StringBuilders

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7790 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 87bd559c42
commit 7db208c992

@ -61,10 +61,9 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.ReferenceContainerArray;
import net.yacy.kelondro.util.MemoryControl;
import de.anomic.search.MetadataRepository;
import de.anomic.search.Segment;
import de.anomic.search.MetadataRepository.Export;
import de.anomic.search.Segment;
public class URLAnalysis {
@ -78,15 +77,15 @@ public class URLAnalysis {
static {
try {
poison = new DigestURI("http://poison.org/poison");
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
poison = null;
}
}
public static class splitter extends Thread {
private ArrayBlockingQueue<DigestURI> in;
private ConcurrentHashMap<String, Integer> out;
private final ArrayBlockingQueue<DigestURI> in;
private final ConcurrentHashMap<String, Integer> out;
public splitter(final ArrayBlockingQueue<DigestURI> in, final ConcurrentHashMap<String, Integer> out) {
this.in = in;
@ -100,15 +99,15 @@ public class URLAnalysis {
final Pattern p = Pattern.compile("~|\\(|\\)|\\+|-|@|:|%|\\.|;|_");
while (true) {
try {
url = in.take();
url = this.in.take();
if (url == poison) break;
update(patternMinus.matcher(url.getHost()).replaceAll("\\.").split("\\."));
update(p.matcher(url.getPath()).replaceAll("/").split("/"));
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
}
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
}
@ -117,8 +116,8 @@ public class URLAnalysis {
Integer c;
for (final String t: s) {
if (t.length() == 0) continue;
c = out.get(t);
out.put(t, (c == null) ? 1 : c.intValue() + 1);
c = this.out.get(t);
this.out.put(t, (c == null) ? 1 : c.intValue() + 1);
}
}
}
@ -165,7 +164,7 @@ public class URLAnalysis {
final File outfile = new File(analysis);
BufferedReader reader = null;
long time = System.currentTimeMillis();
long start = time;
final long start = time;
int count = 0;
System.out.println("start processing");
@ -178,11 +177,11 @@ public class URLAnalysis {
line = line.trim();
if (line.length() > 0) {
try {
DigestURI url = new DigestURI(line);
final DigestURI url = new DigestURI(line);
in.put(url);
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
continue;
}
}
@ -208,12 +207,12 @@ public class URLAnalysis {
System.out.println("stopping threads");
for (int i = 0, available = Runtime.getRuntime().availableProcessors() + 1; i < available; i++) try {
in.put(poison);
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
try {
spl.join();
} catch (InterruptedException e1) {
} catch (final InterruptedException e1) {
Log.logException(e1);
}
@ -252,7 +251,7 @@ public class URLAnalysis {
}
}
os.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
@ -280,9 +279,9 @@ public class URLAnalysis {
line = line.trim();
if (line.length() > 0) {
try {
DigestURI url = new DigestURI(line);
final DigestURI url = new DigestURI(line);
hosts.add(url.getHost());
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
continue;
}
}
@ -324,7 +323,7 @@ public class URLAnalysis {
// write hosts
System.out.println("start writing results");
File outfile = new File(trunk + ((gz) ? ".gz" : ""));
final File outfile = new File(trunk + ((gz) ? ".gz" : ""));
long time = System.currentTimeMillis();
try {
OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
@ -340,7 +339,7 @@ public class URLAnalysis {
}
}
os.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
@ -370,9 +369,9 @@ public class URLAnalysis {
line = line.trim();
if (line.length() > 0) {
try {
DigestURI url = new DigestURI(line);
final DigestURI url = new DigestURI(line);
urls.add(url.toNormalform(true, true));
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
continue;
}
}
@ -412,7 +411,7 @@ public class URLAnalysis {
idx.dump(new File(statisticPath));
System.out.println("INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath);
idx.close();
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
}
@ -454,7 +453,7 @@ public class URLAnalysis {
final Export e = mr.export(new File(export), ".*", hs, format, false);
try {
e.join();
} catch (InterruptedException e1) {
} catch (final InterruptedException e1) {
Log.logException(e1);
}
System.out.println("URL EXPORT finished export, wrote " + ((hs == null) ? mr.size() : hs.size()) + " entries");
@ -495,7 +494,7 @@ public class URLAnalysis {
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT/METADATA used.dump diffurlcol.dump
try {
diffurlcol(args[1], args[2], args[3]);
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
} else if (args[0].equals("-export") && args.length >= 4) {
@ -503,10 +502,10 @@ public class URLAnalysis {
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump
// instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0;
final int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0;
try {
export(args[1], format, args[3], (args.length >= 5) ? args[4] : null);
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
} else if (args[0].equals("-delete") && args.length >= 3) {
@ -516,7 +515,7 @@ public class URLAnalysis {
// instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
try {
delete(args[1], args[2]);
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
} else {
@ -555,7 +554,7 @@ public class URLAnalysis {
}
private static final String num(final int i) {
StringBuffer s = new StringBuffer(Integer.toString(i));
final StringBuilder s = new StringBuilder(Integer.toString(i));
while (s.length() < 9) s.insert(0, "0");
return s.toString();
}

@ -32,13 +32,12 @@ import java.util.Iterator;
import java.util.regex.Pattern;
import net.yacy.kelondro.blob.Tables;
import de.anomic.data.WorkTables;
public class YMarkCrawlStart extends HashMap<String,String>{
private static final long serialVersionUID = 1L;
private WorkTables worktables;
private final WorkTables worktables;
public YMarkCrawlStart(final WorkTables worktables) {
this.worktables = worktables;
@ -46,13 +45,13 @@ public class YMarkCrawlStart extends HashMap<String,String>{
public YMarkCrawlStart(final WorkTables worktables, final String url) {
this.worktables = worktables;
this.clear();
this.load(url);
clear();
load(url);
}
public void load(String url) {
public void load(final String url) {
try {
final StringBuffer buffer = new StringBuffer(500);
final StringBuilder buffer = new StringBuilder(500);
buffer.append("^.*crawlingURL=\\Q");
buffer.append(url);
buffer.append("\\E?.*");
@ -78,12 +77,12 @@ public class YMarkCrawlStart extends HashMap<String,String>{
end = buffer.length()-1;
value = buffer.substring(start, end);
start = end+1;
this.put(key, value);
put(key, value);
}
break;
}
}
} catch (IOException e) {
} catch (final IOException e) {
// TODO Auto-generated catch block
}
}

@ -48,13 +48,13 @@ public class YMarkTables {
private String basename;
private TABLES(String b) {
private TABLES(final String b) {
this.basename = b;
}
public String basename() {
return this.basename;
}
public String tablename(String bmk_user) {
public String tablename(final String bmk_user) {
return bmk_user+this.basename;
}
}
@ -65,13 +65,13 @@ public class YMarkTables {
private String protocol;
private PROTOCOLS(String s) {
private PROTOCOLS(final String s) {
this.protocol = s;
}
public String protocol() {
return this.protocol;
}
public String protocol(String s) {
public String protocol(final String s) {
return this.protocol+s;
}
}
@ -181,7 +181,7 @@ public class YMarkTables {
public Iterator<Tables.Row> getBookmarksByFolder(final String bmk_user, final String folder) throws IOException {
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
final StringBuffer patternBuilder = new StringBuffer(BUFFER_LENGTH);
final StringBuilder patternBuilder = new StringBuilder(BUFFER_LENGTH);
patternBuilder.setLength(0);
patternBuilder.append(p1);
patternBuilder.append('(');
@ -196,7 +196,7 @@ public class YMarkTables {
public Iterator<Tables.Row> getBookmarksByTag(final String bmk_user, final String[] tagArray) throws IOException {
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
final StringBuffer patternBuilder = new StringBuffer(BUFFER_LENGTH);
final StringBuilder patternBuilder = new StringBuilder(BUFFER_LENGTH);
patternBuilder.setLength(0);
patternBuilder.append(p1);
patternBuilder.append(p5);
@ -215,7 +215,7 @@ public class YMarkTables {
}
public SortedSet<Row> orderBookmarksBy(final Iterator<Row> rowIterator, final String sortname, final String sortorder) {
TreeSet<Row> sortTree = new TreeSet<Tables.Row>(new TablesRowComparator(sortname));
final TreeSet<Row> sortTree = new TreeSet<Tables.Row>(new TablesRowComparator(sortname));
Row row;
while (rowIterator.hasNext()) {
row = rowIterator.next();
@ -233,7 +233,7 @@ public class YMarkTables {
final YMarkEntry bmk = new YMarkEntry(false);
bmk.put(YMarkEntry.BOOKMARK.URL.key(), url);
bmk.put(YMarkEntry.BOOKMARK.TAGS.key(), YMarkUtil.cleanTagsString(tagString));
this.addBookmark(bmk_user, bmk, merge, true);
addBookmark(bmk_user, bmk, merge, true);
}
}
@ -243,7 +243,7 @@ public class YMarkTables {
final YMarkEntry bmk = new YMarkEntry(false);
bmk.put(YMarkEntry.BOOKMARK.URL.key(), url);
bmk.put(YMarkEntry.BOOKMARK.FOLDERS.key(), folder);
this.addBookmark(bmk_user, bmk, true, true);
addBookmark(bmk_user, bmk, true, true);
}
}
@ -252,7 +252,7 @@ public class YMarkTables {
final YMarkEntry bmk = new YMarkEntry(false);
bmk.put(YMarkEntry.BOOKMARK.URL.key(), url);
bmk.put(YMarkEntry.BOOKMARK.DATE_VISITED.key(), (new YMarkDate()).toString());
this.addBookmark(bmk_user, bmk, true, true);
addBookmark(bmk_user, bmk, true, true);
}
@ -272,7 +272,7 @@ public class YMarkTables {
HashSet<String> oldSet;
HashSet<String> newSet;
for (YMarkEntry.BOOKMARK b : YMarkEntry.BOOKMARK.values()) {
for (final YMarkEntry.BOOKMARK b : YMarkEntry.BOOKMARK.values()) {
switch(b) {
case DATE_ADDED:
if(!bmk_row.containsKey(b.key()))

@ -1377,7 +1377,7 @@ public final class HTTPDFileHandler {
final Pattern p = Pattern.compile("(href=\"|src=\")([^\"]+)|(href='|src=')([^']+)|(url\\(')([^']+)|(url\\(\")([^\"]+)|(url\\()([^\\)]+)");
final Matcher m = p.matcher(sbuffer);
final StringBuffer result = new StringBuffer();
final StringBuffer result = new StringBuffer(80);
while (m.find()) {
String init = null;
if(m.group(1) != null) init = m.group(1);

@ -37,12 +37,13 @@ import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Scanner;
import net.yacy.cora.storage.ConcurrentScoreMap;
import net.yacy.cora.storage.ClusteredScoreMap;
import net.yacy.cora.storage.ConcurrentScoreMap;
import net.yacy.cora.storage.ScoreMap;
import net.yacy.cora.storage.WeakPriorityBlockingQueue;
import net.yacy.cora.storage.WeakPriorityBlockingQueue.ReverseElement;
@ -55,7 +56,6 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.EventTracker;
import de.anomic.yacy.graphics.ProfilingGraph;
public final class RankingProcess extends Thread {
@ -132,16 +132,16 @@ public final class RankingProcess extends Thread {
// sort the local containers and truncate it to a limited count,
// so following sortings together with the global results will be fast
try {
long timer = System.currentTimeMillis();
final long timer = System.currentTimeMillis();
final TermSearch<WordReference> search = this.query.getSegment().termIndex().query(
query.queryHashes,
query.excludeHashes,
this.query.queryHashes,
this.query.excludeHashes,
null,
Segment.wordReferenceFactory,
query.maxDistance);
this.query.maxDistance);
this.localSearchInclusion = search.inclusion();
final ReferenceContainer<WordReference> index = search.joined();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.JOIN, query.queryString, index.size(), System.currentTimeMillis() - timer), false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.JOIN, this.query.queryString, index.size(), System.currentTimeMillis() - timer), false);
if (index.isEmpty()) {
return;
}
@ -157,7 +157,7 @@ public final class RankingProcess extends Thread {
public void add(
final ReferenceContainer<WordReference> index,
final boolean local,
String resourceName,
final String resourceName,
final int fullResource,
final boolean finalizeAddAtEnd) {
// we collect the urlhashes and construct a list with urlEntry objects
@ -180,11 +180,11 @@ public final class RankingProcess extends Thread {
// normalize entries
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.NORMALIZING, resourceName, index.size(), System.currentTimeMillis() - timer), false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.NORMALIZING, resourceName, index.size(), System.currentTimeMillis() - timer), false);
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
// apply all constraints
try {
@ -197,7 +197,7 @@ public final class RankingProcess extends Thread {
// increase flag counts
for (int j = 0; j < 32; j++) {
if (iEntry.flags().get(j)) {flagcount[j]++;}
if (iEntry.flags().get(j)) {this.flagcount[j]++;}
}
// check constraints
@ -206,11 +206,11 @@ public final class RankingProcess extends Thread {
}
// check document domain
if (query.contentdom != ContentDomain.TEXT) {
if ((query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { continue; }
if ((query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { continue; }
if ((query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { continue; }
if ((query.contentdom == ContentDomain.APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) { continue; }
if (this.query.contentdom != ContentDomain.TEXT) {
if ((this.query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { continue; }
if ((this.query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { continue; }
if ((this.query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { continue; }
if ((this.query.contentdom == ContentDomain.APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) { continue; }
}
// check tld domain
@ -226,27 +226,27 @@ public final class RankingProcess extends Thread {
//this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
// check site constraints
String hosthash = iEntry.hosthash();
if (query.sitehash == null) {
final String hosthash = iEntry.hosthash();
if (this.query.sitehash == null) {
// no site constraint there; maybe collect host navigation information
if (nav_hosts && query.urlMask_isCatchall) {
if (nav_hosts && this.query.urlMask_isCatchall) {
this.hostNavigator.inc(hosthash);
this.hostResolver.put(hosthash, iEntry.urlhash());
}
} else {
if (!hosthash.equals(query.sitehash)) {
if (!hosthash.equals(this.query.sitehash)) {
// filter out all domains that do not match with the site constraint
continue;
}
}
// finally make a double-check and insert result to stack
if (urlhashes.add(iEntry.urlhash())) {
if (this.urlhashes.add(iEntry.urlhash())) {
rankingtryloop: while (true) {
try {
stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
break rankingtryloop;
} catch (ArithmeticException e) {
} catch (final ArithmeticException e) {
// this may happen if the concurrent normalizer changes values during cardinal computation
continue rankingtryloop;
}
@ -256,12 +256,12 @@ public final class RankingProcess extends Thread {
}
}
} catch (InterruptedException e) {} finally {
} catch (final InterruptedException e) {} finally {
if (finalizeAddAtEnd) this.addRunning = false;
}
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.PRESORT, resourceName, index.size(), System.currentTimeMillis() - timer), false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.PRESORT, resourceName, index.size(), System.currentTimeMillis() - timer), false);
}
/**
@ -281,18 +281,18 @@ public final class RankingProcess extends Thread {
}
private boolean testFlags(final WordReference ientry) {
if (query.constraint == null) return true;
if (this.query.constraint == null) return true;
// test if ientry matches with filter
// if all = true: let only entries pass that has all matching bits
// if all = false: let all entries pass that has at least one matching bit
if (query.allofconstraint) {
if (this.query.allofconstraint) {
for (int i = 0; i < 32; i++) {
if ((query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
if ((this.query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
}
return true;
}
for (int i = 0; i < 32; i++) {
if ((query.constraint.get(i)) && (ientry.flags().get(i))) return true;
if ((this.query.constraint.get(i)) && (ientry.flags().get(i))) return true;
}
return false;
}
@ -300,7 +300,7 @@ public final class RankingProcess extends Thread {
protected Map<byte[], ReferenceContainer<WordReference>> searchContainerMap() {
// direct access to the result maps is needed for abstract generation
// this is only available if execQuery() was called before
return localSearchInclusion;
return this.localSearchInclusion;
}
private WeakPriorityBlockingQueue.Element<WordReferenceVars> takeRWI(final boolean skipDoubleDom, final long waitingtime) {
@ -313,14 +313,14 @@ public final class RankingProcess extends Thread {
try {
//System.out.println("stack.poll: feeders = " + this.feeders + ", stack.sizeQueue = " + stack.sizeQueue());
int loops = 0; // a loop counter to terminate the reading if all the results are from the same domain
long timeout = System.currentTimeMillis() + waitingtime;
while (((!feedingIsFinished() && this.addRunning) || stack.sizeQueue() > 0) &&
final long timeout = System.currentTimeMillis() + waitingtime;
while (((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0) &&
(this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage)) {
if (waitingtime <= 0) {
rwi = stack.poll();
rwi = this.stack.poll();
} else timeoutloop:while (System.currentTimeMillis() < timeout) {
if (feedingIsFinished() && stack.sizeQueue() == 0) break timeoutloop;
rwi = stack.poll(50);
if (feedingIsFinished() && this.stack.sizeQueue() == 0) break timeoutloop;
rwi = this.stack.poll(50);
if (rwi != null) break timeoutloop;
}
if (rwi == null) break;
@ -335,7 +335,7 @@ public final class RankingProcess extends Thread {
m = this.doubleDomCache.get(hosthash);
if (m == null) {
// first appearance of dom. we create an entry to signal that one of that domain was already returned
m = new WeakPriorityBlockingQueue<WordReferenceVars>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
m = new WeakPriorityBlockingQueue<WordReferenceVars>((this.query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
this.doubleDomCache.put(hosthash, m);
return rwi;
}
@ -343,7 +343,7 @@ public final class RankingProcess extends Thread {
m.put(rwi);
}
}
} catch (InterruptedException e1) {}
} catch (final InterruptedException e1) {}
if (this.doubleDomCache.isEmpty()) return null;
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
@ -355,7 +355,7 @@ public final class RankingProcess extends Thread {
while (i.hasNext()) {
try {
m = i.next();
} catch (ConcurrentModificationException e) {
} catch (final ConcurrentModificationException e) {
Log.logException(e);
continue; // not the best solution...
}
@ -400,7 +400,7 @@ public final class RankingProcess extends Thread {
if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi);
if (page == null) {
misses.add(obrwi.getElement().urlhash());
this.misses.add(obrwi.getElement().urlhash());
continue;
}
@ -413,9 +413,9 @@ public final class RankingProcess extends Thread {
continue; // rare case where the url is corrupted
}
if (!query.urlMask_isCatchall) {
if (!this.query.urlMask_isCatchall) {
// check url mask
if (!metadata.matches(query.urlMask)) {
if (!metadata.matches(this.query.urlMask)) {
this.sortout++;
continue;
}
@ -439,18 +439,18 @@ public final class RankingProcess extends Thread {
final String pagetitle = metadata.dc_title().toLowerCase();
// check exclusion
if ((QueryParams.anymatch(pagetitle, query.excludeHashes)) ||
(QueryParams.anymatch(pageurl.toLowerCase(), query.excludeHashes)) ||
(QueryParams.anymatch(pageauthor.toLowerCase(), query.excludeHashes))) {
if ((QueryParams.anymatch(pagetitle, this.query.excludeHashes)) ||
(QueryParams.anymatch(pageurl.toLowerCase(), this.query.excludeHashes)) ||
(QueryParams.anymatch(pageauthor.toLowerCase(), this.query.excludeHashes))) {
this.sortout++;
continue;
}
// check index-of constraint
if ((query.constraint != null) &&
(query.constraint.get(Condenser.flag_cat_indexof)) &&
if ((this.query.constraint != null) &&
(this.query.constraint.get(Condenser.flag_cat_indexof)) &&
(!(pagetitle.startsWith("index of")))) {
final Iterator<byte[]> wi = query.queryHashes.iterator();
final Iterator<byte[]> wi = this.query.queryHashes.iterator();
while (wi.hasNext()) {
this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash());
}
@ -459,18 +459,18 @@ public final class RankingProcess extends Thread {
}
// check location constraint
if ((query.constraint != null) &&
(query.constraint.get(Condenser.flag_cat_haslocation)) &&
if ((this.query.constraint != null) &&
(this.query.constraint.get(Condenser.flag_cat_haslocation)) &&
(metadata.lat() == 0.0f || metadata.lon() == 0.0f)) {
this.sortout++;
continue;
}
// check content domain
if ((query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) ||
(query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) ||
(query.contentdom == ContentDomain.IMAGE && page.limage() == 0) ||
(query.contentdom == ContentDomain.APP && page.lapp() == 0)) {
if ((this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) ||
(this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) ||
(this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0) ||
(this.query.contentdom == ContentDomain.APP && page.lapp() == 0)) {
this.sortout++;
continue;
}
@ -479,7 +479,7 @@ public final class RankingProcess extends Thread {
// author navigation:
if (pageauthor != null && pageauthor.length() > 0) {
// add author to the author navigator
String authorhash = ASCII.String(Word.word2hash(pageauthor));
final String authorhash = ASCII.String(Word.word2hash(pageauthor));
// check if we already are filtering for authors
if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) {
@ -518,31 +518,31 @@ public final class RankingProcess extends Thread {
}
public int sizeQueue() {
int c = stack.sizeQueue();
for (WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
int c = this.stack.sizeQueue();
for (final WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
c += s.sizeQueue();
}
return c;
}
public int sizeAvailable() {
int c = stack.sizeAvailable();
for (WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
int c = this.stack.sizeAvailable();
for (final WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
c += s.sizeAvailable();
}
return c;
}
public boolean isEmpty() {
if (!stack.isEmpty()) return false;
for (WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
if (!this.stack.isEmpty()) return false;
for (final WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
if (!s.isEmpty()) return false;
}
return true;
}
public int[] flagCount() {
return flagcount;
return this.flagcount;
}
// "results from a total number of <remote_resourceSize + local_resourceSize> known (<local_resourceSize> local, <remote_resourceSize> remote), <remote_indexCount> links from <remote_peerCount> other YaCy peers."
@ -591,7 +591,7 @@ public final class RankingProcess extends Thread {
}
public ScoreMap<String> getHostNavigator() {
ScoreMap<String> result = new ConcurrentScoreMap<String>();
final ScoreMap<String> result = new ConcurrentScoreMap<String>();
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return result;
final Iterator<String> domhashs = this.hostNavigator.keys(false);
@ -613,14 +613,14 @@ public final class RankingProcess extends Thread {
}
public static final Comparator<Map.Entry<String, Integer>> mecomp = new Comparator<Map.Entry<String, Integer>>() {
public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
public int compare(final Map.Entry<String, Integer> o1, final Map.Entry<String, Integer> o2) {
if (o1.getValue().intValue() < o2.getValue().intValue()) return 1;
if (o2.getValue().intValue() < o1.getValue().intValue()) return -1;
return 0;
}
};
public ScoreMap<String> getTopicNavigator(int count) {
public ScoreMap<String> getTopicNavigator(final int count) {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
final ScoreMap<String> result = new ConcurrentScoreMap<String>();
@ -645,23 +645,25 @@ public final class RankingProcess extends Thread {
counts.put(word, q);
}
}
if (max > min) for (Map.Entry<String, Float> ce: counts.entrySet()) {
if (max > min) for (final Map.Entry<String, Float> ce: counts.entrySet()) {
result.set(ce.getKey(), (int) (((double) count) * (ce.getValue() - min) / (max - min)));
}
return this.ref;
}
private final static Pattern lettermatch = Pattern.compile("[a-z]+");
public void addTopic(final String[] words) {
String word;
for (final String w : words) {
word = w.toLowerCase();
if (word.length() > 2 &&
"http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off".indexOf(word) < 0 &&
!query.queryHashes.has(Word.word2hash(word)) &&
word.matches("[a-z]+") &&
!this.query.queryHashes.has(Word.word2hash(word)) &&
lettermatch.matcher(word).matches() &&
!Switchboard.badwords.contains(word) &&
!Switchboard.stopwords.contains(word)) {
ref.inc(word);
this.ref.inc(word);
}
}
}

@ -28,6 +28,7 @@ package de.anomic.yacy.dht;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
@ -41,11 +42,9 @@ import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import de.anomic.search.Segment;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;
import java.util.List;
public class Dispatcher {
@ -111,14 +110,14 @@ public class Dispatcher {
this.seeds = seeds;
this.log = new Log("INDEX-TRANSFER-DISPATCHER");
this.transmission = new Transmission(
log,
this.log,
segment,
seeds,
gzipBody,
timeout);
int concurrentSender = Math.min(32, Math.max(10, WorkflowProcessor.availableCPU));
indexingTransmissionProcessor = new WorkflowProcessor<Transmission.Chunk>(
final int concurrentSender = Math.min(32, Math.max(10, WorkflowProcessor.availableCPU));
this.indexingTransmissionProcessor = new WorkflowProcessor<Transmission.Chunk>(
"transferDocumentIndex",
"This is the RWI transmission process",
new String[]{"RWI/Cache/Collections"},
@ -152,7 +151,7 @@ public class Dispatcher {
final int maxtime) throws IOException {
// prefer file
ArrayList<ReferenceContainer<WordReference>> containers = selectContainers(hash, limitHash, maxContainerCount, maxReferenceCount, maxtime, false);
final ArrayList<ReferenceContainer<WordReference>> containers = selectContainers(hash, limitHash, maxContainerCount, maxReferenceCount, maxtime, false);
// if ram does not provide any result, take from file
//if (containers.isEmpty()) containers = selectContainers(hash, limitHash, maxContainerCount, maxtime, false);
@ -193,12 +192,12 @@ public class Dispatcher {
final ArrayList<ReferenceContainer<WordReference>> rc;
if (ram) {
// selection was only from ram, so we have to carefully remove only the selected entries
HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
Iterator<WordReference> it;
for (ReferenceContainer<WordReference> c: containers) {
for (final ReferenceContainer<WordReference> c: containers) {
urlHashes.clear();
it = c.entries();
while (it.hasNext()) try { urlHashes.put(it.next().urlhash()); } catch (RowSpaceExceededException e) { Log.logException(e); }
while (it.hasNext()) try { urlHashes.put(it.next().urlhash()); } catch (final RowSpaceExceededException e) { Log.logException(e); }
if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + ASCII.String(c.getTermHash()) + "'");
if (!urlHashes.isEmpty()) this.segment.termIndex().remove(c.getTermHash(), urlHashes);
}
@ -207,7 +206,7 @@ public class Dispatcher {
// selection was from whole index, so we can just delete the whole container
// but to avoid race conditions return the results from the deletes
rc = new ArrayList<ReferenceContainer<WordReference>>(containers.size());
for (ReferenceContainer<WordReference> c: containers) {
for (final ReferenceContainer<WordReference> c: containers) {
container = this.segment.termIndex().delete(c.getTermHash()); // be aware this might be null!
if (container != null && !container.isEmpty()) {
if (this.log.isFine()) this.log.logFine("selected " + container.size() + " urls for word '" + ASCII.String(c.getTermHash()) + "'");
@ -229,24 +228,24 @@ public class Dispatcher {
* @throws RowSpaceExceededException
*/
@SuppressWarnings("unchecked")
private List<ReferenceContainer<WordReference>>[] splitContainers(List<ReferenceContainer<WordReference>> containers) throws RowSpaceExceededException {
private List<ReferenceContainer<WordReference>>[] splitContainers(final List<ReferenceContainer<WordReference>> containers) throws RowSpaceExceededException {
// init the result vector
int partitionCount = this.seeds.scheme.verticalPartitions();
List<ReferenceContainer<WordReference>>[] partitions = (ArrayList<ReferenceContainer<WordReference>>[]) new ArrayList[partitionCount];
final int partitionCount = this.seeds.scheme.verticalPartitions();
final List<ReferenceContainer<WordReference>>[] partitions = new ArrayList[partitionCount];
for (int i = 0; i < partitions.length; i++) partitions[i] = new ArrayList<ReferenceContainer<WordReference>>();
// check all entries and split them to the partitions
ReferenceContainer<WordReference>[] partitionBuffer = new ReferenceContainer[partitionCount];
final ReferenceContainer<WordReference>[] partitionBuffer = new ReferenceContainer[partitionCount];
WordReference re;
for (ReferenceContainer<WordReference> container: containers) {
for (final ReferenceContainer<WordReference> container: containers) {
// init the new partitions
for (int j = 0; j < partitionBuffer.length; j++) {
partitionBuffer[j] = new ReferenceContainer<WordReference>(Segment.wordReferenceFactory, container.getTermHash(), container.size() / partitionCount);
}
// split the container
Iterator<WordReference> i = container.entries();
final Iterator<WordReference> i = container.entries();
while (i.hasNext()) {
re = i.next();
if (re == null) continue;
@ -272,7 +271,7 @@ public class Dispatcher {
* then no additional IO is necessary.
*/
private void enqueueContainersToCloud(final List<ReferenceContainer<WordReference>>[] containers) {
if (transmissionCloud == null) return;
if (this.transmissionCloud == null) return;
ReferenceContainer<WordReference> lastContainer;
byte[] primaryTarget;
ByteArray pTArray;
@ -286,13 +285,13 @@ public class Dispatcher {
// get or make a entry object
entry = this.transmissionCloud.get(pTArray); // if this is not null, the entry is extended here
List<yacySeed> targets = PeerSelection.getAcceptRemoteIndexSeedsList(
seeds,
final List<yacySeed> targets = PeerSelection.getAcceptRemoteIndexSeedsList(
this.seeds,
primaryTarget,
seeds.redundancy() * 3,
this.seeds.redundancy() * 3,
true);
this.log.logInfo("enqueueContainers: selected " + targets.size() + " targets for primary target key " + ASCII.String(primaryTarget) + "/" + vertical + " with " + containers[vertical].size() + " index containers.");
if (entry == null) entry = transmission.newChunk(primaryTarget, targets);
if (entry == null) entry = this.transmission.newChunk(primaryTarget, targets);
/*/ lookup targets
int sc = 1;
@ -305,10 +304,10 @@ public class Dispatcher {
}*/
// fill the entry with the containers
for (ReferenceContainer<WordReference> c: containers[vertical]) {
for (final ReferenceContainer<WordReference> c: containers[vertical]) {
try {
entry.add(c);
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
Log.logException(e);
break;
}
@ -330,7 +329,7 @@ public class Dispatcher {
List<ReferenceContainer<WordReference>> selectedContainerCache;
try {
selectedContainerCache = selectContainers(hash, limitHash, maxContainerCount, maxReferenceCount, maxtime);
} catch (IOException e) {
} catch (final IOException e) {
this.log.logSevere("selectContainersEnqueueToCloud: selectedContainer failed", e);
return false;
}
@ -344,7 +343,7 @@ public class Dispatcher {
List<ReferenceContainer<WordReference>>[] splitContainerCache;
try {
splitContainerCache = splitContainers(selectedContainerCache);
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
this.log.logSevere("selectContainersEnqueueToCloud: splitContainers failed because of too low RAM", e);
return false;
}
@ -371,30 +370,37 @@ public class Dispatcher {
* This method returns true if a container was dequeued, false if not
*/
public boolean dequeueContainer() {
if (transmissionCloud == null) return false;
if (this.indexingTransmissionProcessor.queueSize() > indexingTransmissionProcessor.concurrency()) return false;
if (this.transmissionCloud == null) return false;
if (this.indexingTransmissionProcessor.queueSize() > this.indexingTransmissionProcessor.concurrency()) return false;
ByteArray maxtarget = null;
int maxsize = -1;
for (Map.Entry<ByteArray, Transmission.Chunk> chunk: this.transmissionCloud.entrySet()) {
for (final Map.Entry<ByteArray, Transmission.Chunk> chunk: this.transmissionCloud.entrySet()) {
if (chunk.getValue().containersSize() > maxsize) {
maxsize = chunk.getValue().containersSize();
maxtarget = chunk.getKey();
}
}
if (maxsize < 0) return false;
Transmission.Chunk chunk = this.transmissionCloud.remove(maxtarget);
final Transmission.Chunk chunk = this.transmissionCloud.remove(maxtarget);
try {
this.indexingTransmissionProcessor.enQueue(chunk);
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
return true;
}
public Transmission.Chunk transferDocumentIndex(Transmission.Chunk chunk) {
/**
* transfer job: this method is called using reflection from the switchboard
* the method is called as a Workflow process. That means it is always called whenever
* a job is placed in the workflow queue. This happens in dequeueContainer()
* @param chunk
* @return
*/
public Transmission.Chunk transferDocumentIndex(final Transmission.Chunk chunk) {
// do the transmission
boolean success = chunk.transmit();
final boolean success = chunk.transmit();
if (success && chunk.isFinished()) {
// finished with this queue!
@ -407,7 +413,7 @@ public class Dispatcher {
if (chunk.canFinish()) {
try {
if (this.indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.enQueue(chunk);
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
return null;
}
@ -420,12 +426,12 @@ public class Dispatcher {
public void close() {
// removes all entries from the dispatcher and puts them back to a RAMRI
if (indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.announceShutdown();
if (this.indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.announceShutdown();
if (this.transmissionCloud != null) {
outerLoop: for (Map.Entry<ByteArray, Transmission.Chunk> e : this.transmissionCloud.entrySet()) {
for (ReferenceContainer<WordReference> i : e.getValue()) try {
outerLoop: for (final Map.Entry<ByteArray, Transmission.Chunk> e : this.transmissionCloud.entrySet()) {
for (final ReferenceContainer<WordReference> i : e.getValue()) try {
this.segment.termIndex().add(i);
} catch (Exception e1) {
} catch (final Exception e1) {
Log.logException(e1);
break outerLoop;
}
@ -433,7 +439,7 @@ public class Dispatcher {
this.transmissionCloud.clear();
}
this.transmissionCloud = null;
if (indexingTransmissionProcessor != null) {
if (this.indexingTransmissionProcessor != null) {
this.indexingTransmissionProcessor.awaitShutdown(10000);
this.indexingTransmissionProcessor.clear();
}

@ -464,17 +464,17 @@ public class Table implements Index, Iterable<Row.Entry> {
final int i = (int) this.index.get(key);
if (i == -1) return null;
final byte[] b = new byte[this.rowdef.objectsize];
if (this.table == null) {
final Row.Entry cacherow;
if (this.table == null || (cacherow = this.table.get(i, false)) == null) {
// read row from the file
this.file.get(i, b, 0);
} else {
// construct the row using the copy in RAM
final Row.Entry v = this.table.get(i, false);
assert v != null;
if (v == null) return null;
assert cacherow != null;
if (cacherow == null) return null;
assert key.length == this.rowdef.primaryKeyLength;
System.arraycopy(key, 0, b, 0, key.length);
System.arraycopy(v.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength);
System.arraycopy(cacherow.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength);
}
return this.rowdef.newEntry(b);
}
@ -503,7 +503,7 @@ public class Table implements Index, Iterable<Row.Entry> {
assert this.table == null || this.table.size() == this.index.size() : "table.size() = " + this.table.size() + ", index.size() = " + this.index.size();
assert row != null;
assert row.bytes() != null;
if ((row == null) || (row.bytes() == null)) return null;
if (row == null || row.bytes() == null) return null;
final int i = (int) this.index.get(row.getPrimaryKeyBytes());
if (i == -1) {
try {
@ -517,17 +517,17 @@ public class Table implements Index, Iterable<Row.Entry> {
}
final byte[] b = new byte[this.rowdef.objectsize];
if (this.table == null) {
Row.Entry cacherow;
if (this.table == null || (cacherow = this.table.get(i, false)) == null) {
// read old value
this.file.get(i, b, 0);
// write new value
this.file.put(i, row.bytes(), 0);
} else {
// read old value
final Row.Entry v = this.table.get(i, false);
assert v != null;
assert cacherow != null;
System.arraycopy(row.getPrimaryKeyBytes(), 0, b, 0, this.rowdef.primaryKeyLength);
System.arraycopy(v.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength);
System.arraycopy(cacherow.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength);
// write new value
try {
this.table.set(i, this.taildef.newEntry(row.bytes(), this.rowdef.primaryKeyLength, true));
@ -573,13 +573,12 @@ public class Table implements Index, Iterable<Row.Entry> {
this.file.put(i, row.bytes(), 0);
} else {
// write new value
try {
this.file.put(i, row.bytes(), 0);
if (abandonTable()) this.table = null; else try {
this.table.set(i, this.taildef.newEntry(row.bytes(), this.rowdef.primaryKeyLength, true));
} catch (final RowSpaceExceededException e) {
this.table = null;
}
if (abandonTable()) this.table = null;
this.file.put(i, row.bytes(), 0);
}
assert this.file.size() == this.index.size() : "file.size() = " + this.file.size() + ", index.size() = " + this.index.size();
assert this.table == null || this.table.size() == this.index.size() : "table.size() = " + this.table.size() + ", index.size() = " + this.index.size();
@ -665,7 +664,8 @@ public class Table implements Index, Iterable<Row.Entry> {
final int sb = this.index.size();
int ix;
assert i < this.index.size();
if (this.table == null) {
final Row.Entry cacherow;
if (this.table == null || (cacherow = this.table.get(i, false)) == null) {
if (i == this.index.size() - 1) {
// element is at last entry position
ix = (int) this.index.remove(key);
@ -697,9 +697,8 @@ public class Table implements Index, Iterable<Row.Entry> {
assert this.file.size() == this.index.size() : "file.size() = " + this.file.size() + ", index.size() = " + this.index.size();
} else {
// get result value from the table copy, so we don't need to read it from the file
final Row.Entry v = this.table.get(i, false);
System.arraycopy(key, 0, b, 0, key.length);
System.arraycopy(v.bytes(), 0, b, this.rowdef.primaryKeyLength, this.taildef.objectsize);
System.arraycopy(cacherow.bytes(), 0, b, this.rowdef.primaryKeyLength, this.taildef.objectsize);
if (i == this.index.size() - 1) {
// special handling if the entry is the last entry in the file
@ -911,7 +910,8 @@ public class Table implements Index, Iterable<Row.Entry> {
this.c = (int) Table.this.index.get(k);
if (this.c < 0) throw new ConcurrentModificationException(); // this should only happen if the table was modified during the iteration
final byte[] b = new byte[Table.this.rowdef.objectsize];
if (Table.this.table == null) {
final Row.Entry cacherow;
if (Table.this.table == null || (cacherow = Table.this.table.get(this.c, false)) == null) {
// read from file
try {
Table.this.file.get(this.c, b, 0);
@ -921,11 +921,10 @@ public class Table implements Index, Iterable<Row.Entry> {
}
} else {
// compose from table and key
final Row.Entry v = Table.this.table.get(this.c, false);
assert v != null;
if (v == null) return null;
assert cacherow != null;
if (cacherow == null) return null;
System.arraycopy(k, 0, b, 0, Table.this.rowdef.primaryKeyLength);
System.arraycopy(v.bytes(), 0, b, Table.this.rowdef.primaryKeyLength, Table.this.taildef.objectsize);
System.arraycopy(cacherow.bytes(), 0, b, Table.this.rowdef.primaryKeyLength, Table.this.taildef.objectsize);
}
return Table.this.rowdef.newEntry(b);
}

Loading…
Cancel
Save