performance hacks: more pre-allocated StringBuilder

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7790 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 87bd559c42
commit 7db208c992

@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -61,10 +61,9 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.ReferenceContainerArray;
import net.yacy.kelondro.util.MemoryControl;
import de.anomic.search.MetadataRepository;
import de.anomic.search.Segment;
import de.anomic.search.MetadataRepository.Export;
import de.anomic.search.Segment;
public class URLAnalysis {
@ -78,21 +77,21 @@ public class URLAnalysis {
static {
try {
poison = new DigestURI("http://poison.org/poison");
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
poison = null;
}
}
public static class splitter extends Thread {
private ArrayBlockingQueue<DigestURI> in;
private ConcurrentHashMap<String, Integer> out;
private final ArrayBlockingQueue<DigestURI> in;
private final ConcurrentHashMap<String, Integer> out;
public splitter(final ArrayBlockingQueue<DigestURI> in, final ConcurrentHashMap<String, Integer> out) {
this.in = in;
this.out = out;
}
@Override
public void run() {
try {
@ -100,29 +99,29 @@ public class URLAnalysis {
final Pattern p = Pattern.compile("~|\\(|\\)|\\+|-|@|:|%|\\.|;|_");
while (true) {
try {
url = in.take();
url = this.in.take();
if (url == poison) break;
update(patternMinus.matcher(url.getHost()).replaceAll("\\.").split("\\."));
update(p.matcher(url.getPath()).replaceAll("/").split("/"));
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
}
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
}
private void update(final String[] s) {
Integer c;
for (final String t: s) {
if (t.length() == 0) continue;
c = out.get(t);
out.put(t, (c == null) ? 1 : c.intValue() + 1);
c = this.out.get(t);
this.out.put(t, (c == null) ? 1 : c.intValue() + 1);
}
}
}
public static void cleanup(final ConcurrentHashMap<String, Integer> stat) {
Map.Entry<String, Integer> entry;
int c, low = Integer.MAX_VALUE;
@ -146,7 +145,7 @@ public class URLAnalysis {
}
Runtime.getRuntime().gc();
}
public static void genstat(final String urlfile) {
final boolean gz = urlfile.endsWith(".gz");
@ -165,7 +164,7 @@ public class URLAnalysis {
final File outfile = new File(analysis);
BufferedReader reader = null;
long time = System.currentTimeMillis();
long start = time;
final long start = time;
int count = 0;
System.out.println("start processing");
@ -178,11 +177,11 @@ public class URLAnalysis {
line = line.trim();
if (line.length() > 0) {
try {
DigestURI url = new DigestURI(line);
final DigestURI url = new DigestURI(line);
in.put(url);
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
continue;
}
}
@ -208,15 +207,15 @@ public class URLAnalysis {
System.out.println("stopping threads");
for (int i = 0, available = Runtime.getRuntime().availableProcessors() + 1; i < available; i++) try {
in.put(poison);
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
try {
spl.join();
} catch (InterruptedException e1) {
} catch (final InterruptedException e1) {
Log.logException(e1);
}
// generate statistics
System.out.println("start processing results");
final TreeMap<String, Integer> results = new TreeMap<String, Integer>();
@ -233,7 +232,7 @@ public class URLAnalysis {
System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
}
}
// write statistics
System.out.println("start writing results");
try {
@ -252,13 +251,13 @@ public class URLAnalysis {
}
}
os.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
System.out.println("finished");
}
public static void genhost(final String urlfile) {
final boolean gz = urlfile.endsWith(".gz");
@ -280,9 +279,9 @@ public class URLAnalysis {
line = line.trim();
if (line.length() > 0) {
try {
DigestURI url = new DigestURI(line);
final DigestURI url = new DigestURI(line);
hosts.add(url.getHost());
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
continue;
}
}
@ -298,7 +297,7 @@ public class URLAnalysis {
} finally {
if (reader != null) try { reader.close(); } catch (final Exception e) {}
}
// copy everything into a TreeSet to order it
System.out.println("start processing results");
final TreeSet<String> results = new TreeSet<String>();
@ -313,18 +312,18 @@ public class URLAnalysis {
System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
}
}
// write hosts
writeSet(trunk, gz, results);
System.out.println("finished");
}
private static void writeSet(final String trunk, final boolean gz, final Set<String> set) {
// write hosts
System.out.println("start writing results");
File outfile = new File(trunk + ((gz) ? ".gz" : ""));
final File outfile = new File(trunk + ((gz) ? ".gz" : ""));
long time = System.currentTimeMillis();
try {
OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
@ -340,13 +339,13 @@ public class URLAnalysis {
}
}
os.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
System.out.println("finished writing results");
}
public static void sortsplit(final String urlfile) {
final boolean gz = urlfile.endsWith(".gz");
@ -370,9 +369,9 @@ public class URLAnalysis {
line = line.trim();
if (line.length() > 0) {
try {
DigestURI url = new DigestURI(line);
final DigestURI url = new DigestURI(line);
urls.add(url.toNormalform(true, true));
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
continue;
}
}
@ -394,13 +393,13 @@ public class URLAnalysis {
} finally {
if (reader != null) try { reader.close(); } catch (final Exception e) {}
}
// write hosts
writeSet(trunk + "." + filecount, gz, urls);
System.out.println("finished");
}
public static void incell(final File cellPath, final String statisticPath) {
try {
final HandleMap idx = ReferenceContainerArray.referenceHashes(
@ -412,7 +411,7 @@ public class URLAnalysis {
idx.dump(new File(statisticPath));
System.out.println("INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath);
idx.close();
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
}
@ -444,7 +443,7 @@ public class URLAnalysis {
System.out.println("INDEX DIFF URL-COL finished dump, wrote " + count + " references that occur in the URL-DB, but not in the collection-dump");
return count;
}
public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException {
// format: 0=text, 1=html, 2=rss/xml
System.out.println("URL EXPORT startup");
@ -454,12 +453,12 @@ public class URLAnalysis {
final Export e = mr.export(new File(export), ".*", hs, format, false);
try {
e.join();
} catch (InterruptedException e1) {
} catch (final InterruptedException e1) {
Log.logException(e1);
}
System.out.println("URL EXPORT finished export, wrote " + ((hs == null) ? mr.size() : hs.size()) + " entries");
}
public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException {
System.out.println("URL DELETE startup");
final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false);
@ -471,12 +470,12 @@ public class URLAnalysis {
}
System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database");
}
public static void main(final String[] args) {
if (args[0].equals("-stat") && args.length >= 2) {
// generate a statistics about common words in file, store to <file>.stat
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz
for (int i = 1; i < args.length; i++) genstat(args[i]);
} else if (args[0].equals("-host") && args.length >= 2) {
// generate a file <file>.host containing only the hosts of the urls
@ -495,7 +494,7 @@ public class URLAnalysis {
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT/METADATA used.dump diffurlcol.dump
try {
diffurlcol(args[1], args[2], args[3]);
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
} else if (args[0].equals("-export") && args.length >= 4) {
@ -503,10 +502,10 @@ public class URLAnalysis {
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump
// instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0;
final int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0;
try {
export(args[1], format, args[3], (args.length >= 5) ? args[4] : null);
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
} else if (args[0].equals("-delete") && args.length >= 3) {
@ -516,7 +515,7 @@ public class URLAnalysis {
// instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
try {
delete(args[1], args[2]);
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
}
} else {
@ -553,9 +552,9 @@ public class URLAnalysis {
}
System.exit(0); // kill remaining threads
}
private static final String num(final int i) {
StringBuffer s = new StringBuffer(Integer.toString(i));
final StringBuilder s = new StringBuilder(Integer.toString(i));
while (s.length() < 9) s.insert(0, "0");
return s.toString();
}

@ -9,7 +9,7 @@
// $LastChangedBy: apfelmaennchen $
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -32,39 +32,38 @@ import java.util.Iterator;
import java.util.regex.Pattern;
import net.yacy.kelondro.blob.Tables;
import de.anomic.data.WorkTables;
public class YMarkCrawlStart extends HashMap<String,String>{
private static final long serialVersionUID = 1L;
private WorkTables worktables;
private final WorkTables worktables;
public YMarkCrawlStart(final WorkTables worktables) {
this.worktables = worktables;
}
public YMarkCrawlStart(final WorkTables worktables, final String url) {
this.worktables = worktables;
this.clear();
this.load(url);
clear();
load(url);
}
public void load(String url) {
public void load(final String url) {
try {
final StringBuffer buffer = new StringBuffer(500);
final StringBuilder buffer = new StringBuilder(500);
buffer.append("^.*crawlingURL=\\Q");
buffer.append(url);
buffer.append("\\E?.*");
final Pattern pattern = Pattern.compile(buffer.toString());
final Pattern pattern = Pattern.compile(buffer.toString());
final Iterator<Tables.Row> APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern);
Tables.Row row = null;
Tables.Row row = null;
while(APIcalls.hasNext()) {
row = APIcalls.next();
if(row.get(WorkTables.TABLE_API_COL_TYPE, "").equals("crawler")) {
buffer.setLength(0);
buffer.append(row.get(WorkTables.TABLE_API_COL_URL, ""));
buffer.delete(0, buffer.indexOf("?")+1);
buffer.delete(0, buffer.indexOf("?")+1);
int start = 0;
int end = 0;
String key;
@ -78,12 +77,12 @@ public class YMarkCrawlStart extends HashMap<String,String>{
end = buffer.length()-1;
value = buffer.substring(start, end);
start = end+1;
this.put(key, value);
put(key, value);
}
break;
}
}
}
} catch (IOException e) {
} catch (final IOException e) {
// TODO Auto-generated catch block
}
}

@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -40,48 +40,48 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import de.anomic.data.WorkTables;
public class YMarkTables {
public static enum TABLES {
BOOKMARKS ("_bookmarks"),
TAGS ("_tags"),
FOLDERS ("_folders");
private String basename;
private TABLES(String b) {
private TABLES(final String b) {
this.basename = b;
}
public String basename() {
return this.basename;
}
public String tablename(String bmk_user) {
public String tablename(final String bmk_user) {
return bmk_user+this.basename;
}
}
public static enum PROTOCOLS {
HTTP ("http://"),
HTTPS ("https://");
private String protocol;
private PROTOCOLS(String s) {
private PROTOCOLS(final String s) {
this.protocol = s;
}
public String protocol() {
return this.protocol;
}
public String protocol(String s) {
public String protocol(final String s) {
return this.protocol+s;
}
}
public final static String FOLDERS_ROOT = "/";
public final static String FOLDERS_ROOT = "/";
public final static String BOOKMARKS_LOG = "BOOKMARKS";
public final static String USER_ADMIN = "admin";
public final static String USER_AUTHENTICATE = "AUTHENTICATE";
public final static String USER_AUTHENTICATE_MSG = "Authentication required!";
public final static String p1 = "(?:^|.*,)";
public final static String p2 = "\\Q";
public final static String p3 = "\\E";
@ -90,15 +90,15 @@ public class YMarkTables {
public final static String p6 = "),.*){";
public final static String p7 = "/.*)";
public final static String p8 = "(?:,|$)";
public final static int BUFFER_LENGTH = 256;
private final WorkTables worktables;
public YMarkTables(final Tables wt) {
this.worktables = (WorkTables)wt;
}
public void deleteBookmark(final String bmk_user, final byte[] urlHash) throws IOException, RowSpaceExceededException {
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
Tables.Row bmk_row = null;
@ -107,11 +107,11 @@ public class YMarkTables {
this.worktables.delete(bmk_table,urlHash);
}
}
public void deleteBookmark(final String bmk_user, final String url) throws IOException, RowSpaceExceededException {
this.deleteBookmark(bmk_user, YMarkUtil.getBookmarkId(url));
}
public TreeMap<String, YMarkTag> getTags(final Iterator<Row> rowIterator) {
final TreeMap<String,YMarkTag> tags = new TreeMap<String,YMarkTag>();
Tables.Row bmk_row = null;
@ -133,14 +133,14 @@ public class YMarkTables {
}
return tags;
}
public TreeMap<String, YMarkTag> getTags(final String bmk_user) throws IOException {
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
final TreeMap<String,YMarkTag> tags = getTags(this.worktables.iterator(bmk_table));
return tags;
}
public TreeSet<String> getFolders(final String bmk_user, final String root) throws IOException {
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
final TreeSet<String> folders = new TreeSet<String>();
@ -155,11 +155,11 @@ public class YMarkTables {
final Pattern r = Pattern.compile(patternBuilder.toString());
final Iterator<Tables.Row> bit = this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.FOLDERS.key(), r);
Tables.Row bmk_row = null;
while(bit.hasNext()) {
bmk_row = bit.next();
if(bmk_row.containsKey(YMarkEntry.BOOKMARK.FOLDERS.key())) {
final String[] folderArray = (new String(bmk_row.get(YMarkEntry.BOOKMARK.FOLDERS.key()),"UTF8")).split(YMarkUtil.TAGS_SEPARATOR);
if(bmk_row.containsKey(YMarkEntry.BOOKMARK.FOLDERS.key())) {
final String[] folderArray = (new String(bmk_row.get(YMarkEntry.BOOKMARK.FOLDERS.key()),"UTF8")).split(YMarkUtil.TAGS_SEPARATOR);
for (final String folder : folderArray) {
if(folder.length() > root.length() && folder.substring(0, root.length()+1).equals(root+'/')) {
if(!folders.contains(folder)) {
@ -167,21 +167,21 @@ public class YMarkTables {
path.append(folder);
//TODO: get rid of .toString.equals()
while(path.length() > 0 && !path.toString().equals(root)){
folders.add(path.toString());
folders.add(path.toString());
path.setLength(path.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR));
}
}
}
}
}
}
}
}
if (!root.equals(YMarkTables.FOLDERS_ROOT)) { folders.add(root); }
if (!root.equals(YMarkTables.FOLDERS_ROOT)) { folders.add(root); }
return folders;
}
public Iterator<Tables.Row> getBookmarksByFolder(final String bmk_user, final String folder) throws IOException {
public Iterator<Tables.Row> getBookmarksByFolder(final String bmk_user, final String folder) throws IOException {
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
final StringBuffer patternBuilder = new StringBuffer(BUFFER_LENGTH);
final StringBuilder patternBuilder = new StringBuilder(BUFFER_LENGTH);
patternBuilder.setLength(0);
patternBuilder.append(p1);
patternBuilder.append('(');
@ -193,10 +193,10 @@ public class YMarkTables {
final Pattern p = Pattern.compile(patternBuilder.toString());
return this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.FOLDERS.key(), p);
}
public Iterator<Tables.Row> getBookmarksByTag(final String bmk_user, final String[] tagArray) throws IOException {
public Iterator<Tables.Row> getBookmarksByTag(final String bmk_user, final String[] tagArray) throws IOException {
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
final StringBuffer patternBuilder = new StringBuffer(BUFFER_LENGTH);
final StringBuilder patternBuilder = new StringBuilder(BUFFER_LENGTH);
patternBuilder.setLength(0);
patternBuilder.append(p1);
patternBuilder.append(p5);
@ -213,9 +213,9 @@ public class YMarkTables {
final Pattern p = Pattern.compile(patternBuilder.toString());
return this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.TAGS.key(), p);
}
public SortedSet<Row> orderBookmarksBy(final Iterator<Row> rowIterator, final String sortname, final String sortorder) {
TreeSet<Row> sortTree = new TreeSet<Tables.Row>(new TablesRowComparator(sortname));
final TreeSet<Row> sortTree = new TreeSet<Tables.Row>(new TablesRowComparator(sortname));
Row row;
while (rowIterator.hasNext()) {
row = rowIterator.next();
@ -226,36 +226,36 @@ public class YMarkTables {
return sortTree.descendingSet();
return sortTree;
}
public void addTags(final String bmk_user, final String url, final String tagString, final boolean merge) throws IOException, RowSpaceExceededException {
if(!tagString.isEmpty()) {
// do not set defaults as we only want to update tags
final YMarkEntry bmk = new YMarkEntry(false);
bmk.put(YMarkEntry.BOOKMARK.URL.key(), url);
bmk.put(YMarkEntry.BOOKMARK.URL.key(), url);
bmk.put(YMarkEntry.BOOKMARK.TAGS.key(), YMarkUtil.cleanTagsString(tagString));
this.addBookmark(bmk_user, bmk, merge, true);
}
addBookmark(bmk_user, bmk, merge, true);
}
}
public void addFolder(final String bmk_user, final String url, final String folder) throws IOException, RowSpaceExceededException {
if(!folder.isEmpty()) {
// do not set defaults as we only want to add a folder
final YMarkEntry bmk = new YMarkEntry(false);
bmk.put(YMarkEntry.BOOKMARK.URL.key(), url);
bmk.put(YMarkEntry.BOOKMARK.URL.key(), url);
bmk.put(YMarkEntry.BOOKMARK.FOLDERS.key(), folder);
this.addBookmark(bmk_user, bmk, true, true);
}
addBookmark(bmk_user, bmk, true, true);
}
}
public void visited(final String bmk_user, final String url) throws IOException, RowSpaceExceededException {
// do not set defaults
final YMarkEntry bmk = new YMarkEntry(false);
bmk.put(YMarkEntry.BOOKMARK.URL.key(), url);
bmk.put(YMarkEntry.BOOKMARK.URL.key(), url);
bmk.put(YMarkEntry.BOOKMARK.DATE_VISITED.key(), (new YMarkDate()).toString());
this.addBookmark(bmk_user, bmk, true, true);
addBookmark(bmk_user, bmk, true, true);
}
public void addBookmark(final String bmk_user, final YMarkEntry bmk, final boolean mergeTags, final boolean mergeFolders) throws IOException, RowSpaceExceededException {
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
final String date = String.valueOf(System.currentTimeMillis());
@ -267,19 +267,19 @@ public class YMarkTables {
if (bmk_row == null) {
// create and insert new entry
this.worktables.insert(bmk_table, urlHash, bmk.getData());
} else {
} else {
// modify and update existing entry
HashSet<String> oldSet;
HashSet<String> newSet;
for (YMarkEntry.BOOKMARK b : YMarkEntry.BOOKMARK.values()) {
for (final YMarkEntry.BOOKMARK b : YMarkEntry.BOOKMARK.values()) {
switch(b) {
case DATE_ADDED:
if(!bmk_row.containsKey(b.key()))
bmk_row.put(b.key(), date);
bmk_row.put(b.key(), date);
break;
case DATE_MODIFIED:
bmk_row.put(b.key(), date);
bmk_row.put(b.key(), date);
break;
case TAGS:
oldSet = YMarkUtil.keysStringToSet(bmk_row.get(b.key(),b.deflt()));
@ -295,7 +295,7 @@ public class YMarkTables {
}
} else {
bmk_row.put(b.key(), bmk_row.get(b.key(), b.deflt()));
}
}
break;
case FOLDERS:
oldSet = YMarkUtil.keysStringToSet(bmk_row.get(b.key(),b.deflt()));
@ -312,7 +312,7 @@ public class YMarkTables {
} else {
bmk_row.put(b.key(), bmk_row.get(b.key(), b.deflt()));
}
break;
break;
default:
if(bmk.containsKey(b.key())) {
bmk_row.put(b.key(), bmk.get(b.key()));
@ -322,7 +322,7 @@ public class YMarkTables {
}
}
// update bmk_table
this.worktables.update(bmk_table, bmk_row);
this.worktables.update(bmk_table, bmk_row);
}
}
}

@ -1377,7 +1377,7 @@ public final class HTTPDFileHandler {
final Pattern p = Pattern.compile("(href=\"|src=\")([^\"]+)|(href='|src=')([^']+)|(url\\(')([^']+)|(url\\(\")([^\"]+)|(url\\()([^\\)]+)");
final Matcher m = p.matcher(sbuffer);
final StringBuffer result = new StringBuffer();
final StringBuffer result = new StringBuffer(80);
while (m.find()) {
String init = null;
if(m.group(1) != null) init = m.group(1);

@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -37,12 +37,13 @@ import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Scanner;
import net.yacy.cora.storage.ConcurrentScoreMap;
import net.yacy.cora.storage.ClusteredScoreMap;
import net.yacy.cora.storage.ConcurrentScoreMap;
import net.yacy.cora.storage.ScoreMap;
import net.yacy.cora.storage.WeakPriorityBlockingQueue;
import net.yacy.cora.storage.WeakPriorityBlockingQueue.ReverseElement;
@ -55,13 +56,12 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.EventTracker;
import de.anomic.yacy.graphics.ProfilingGraph;
public final class RankingProcess extends Thread {
private static final int maxDoubleDomAll = 1000, maxDoubleDomSpecial = 10000;
private final QueryParams query;
private final SortedSet<byte[]> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private final int[] flagcount; // flag counter
@ -69,14 +69,14 @@ public final class RankingProcess extends Thread {
private int sortout; // counter for referenced that had been sorted out for other reasons
//private final int[] domZones;
private SortedMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
private int remote_resourceSize, remote_indexCount, remote_peerCount;
private int local_resourceSize, local_indexCount;
private final WeakPriorityBlockingQueue<WordReferenceVars> stack;
private int feeders;
private final ConcurrentHashMap<String, WeakPriorityBlockingQueue<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
//private final HandleSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process
private final ScoreMap<String> ref; // reference score computation for the commonSense heuristic
private final ScoreMap<String> hostNavigator; // a counter for the appearance of the host hash
private final Map<String, byte[]> hostResolver; // a mapping from a host hash (6 bytes) to the full url hash of one of these urls that have the host hash
@ -85,7 +85,7 @@ public final class RankingProcess extends Thread {
private final ReferenceOrder order;
private final long startTime;
private boolean addRunning;
public RankingProcess(final QueryParams query, final ReferenceOrder order, final int maxentries) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
@ -116,36 +116,36 @@ public final class RankingProcess extends Thread {
this.feeders = 1;
this.startTime = System.currentTimeMillis();
}
public QueryParams getQuery() {
return this.query;
}
public ReferenceOrder getOrder() {
return this.order;
}
@Override
public void run() {
// do a search
// sort the local containers and truncate it to a limited count,
// so following sortings together with the global results will be fast
try {
long timer = System.currentTimeMillis();
final long timer = System.currentTimeMillis();
final TermSearch<WordReference> search = this.query.getSegment().termIndex().query(
query.queryHashes,
query.excludeHashes,
this.query.queryHashes,
this.query.excludeHashes,
null,
Segment.wordReferenceFactory,
query.maxDistance);
this.query.maxDistance);
this.localSearchInclusion = search.inclusion();
final ReferenceContainer<WordReference> index = search.joined();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.JOIN, query.queryString, index.size(), System.currentTimeMillis() - timer), false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.JOIN, this.query.queryString, index.size(), System.currentTimeMillis() - timer), false);
if (index.isEmpty()) {
return;
}
add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, true);
} catch (final Exception e) {
Log.logException(e);
@ -153,21 +153,21 @@ public final class RankingProcess extends Thread {
oneFeederTerminated();
}
}
public void add(
final ReferenceContainer<WordReference> index,
final boolean local,
String resourceName,
final String resourceName,
final int fullResource,
final boolean finalizeAddAtEnd) {
// we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime
this.addRunning = true;
assert (index != null);
if (index.isEmpty()) return;
if (local) {
this.local_resourceSize += index.size();
} else {
@ -175,16 +175,16 @@ public final class RankingProcess extends Thread {
this.remote_resourceSize += fullResource;
this.remote_peerCount++;
}
long timer = System.currentTimeMillis();
// normalize entries
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.NORMALIZING, resourceName, index.size(), System.currentTimeMillis() - timer), false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.NORMALIZING, resourceName, index.size(), System.currentTimeMillis() - timer), false);
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
// apply all constraints
try {
@ -197,7 +197,7 @@ public final class RankingProcess extends Thread {
// increase flag counts
for (int j = 0; j < 32; j++) {
if (iEntry.flags().get(j)) {flagcount[j]++;}
if (iEntry.flags().get(j)) {this.flagcount[j]++;}
}
// check constraints
@ -206,11 +206,11 @@ public final class RankingProcess extends Thread {
}
// check document domain
if (query.contentdom != ContentDomain.TEXT) {
if ((query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { continue; }
if ((query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { continue; }
if ((query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { continue; }
if ((query.contentdom == ContentDomain.APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) { continue; }
if (this.query.contentdom != ContentDomain.TEXT) {
if ((this.query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { continue; }
if ((this.query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { continue; }
if ((this.query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { continue; }
if ((this.query.contentdom == ContentDomain.APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) { continue; }
}
// check tld domain
@ -221,32 +221,32 @@ public final class RankingProcess extends Thread {
continue;
}
*/
// count domZones
//this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
// check site constraints
String hosthash = iEntry.hosthash();
if (query.sitehash == null) {
final String hosthash = iEntry.hosthash();
if (this.query.sitehash == null) {
// no site constraint there; maybe collect host navigation information
if (nav_hosts && query.urlMask_isCatchall) {
if (nav_hosts && this.query.urlMask_isCatchall) {
this.hostNavigator.inc(hosthash);
this.hostResolver.put(hosthash, iEntry.urlhash());
}
} else {
if (!hosthash.equals(query.sitehash)) {
if (!hosthash.equals(this.query.sitehash)) {
// filter out all domains that do not match with the site constraint
continue;
}
}
// finally make a double-check and insert result to stack
if (urlhashes.add(iEntry.urlhash())) {
if (this.urlhashes.add(iEntry.urlhash())) {
rankingtryloop: while (true) {
try {
stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
this.stack.put(new ReverseElement<WordReferenceVars>(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest)
break rankingtryloop;
} catch (ArithmeticException e) {
} catch (final ArithmeticException e) {
// this may happen if the concurrent normalizer changes values during cardinal computation
continue rankingtryloop;
}
@ -256,14 +256,14 @@ public final class RankingProcess extends Thread {
}
}
} catch (InterruptedException e) {} finally {
} catch (final InterruptedException e) {} finally {
if (finalizeAddAtEnd) this.addRunning = false;
}
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.PRESORT, resourceName, index.size(), System.currentTimeMillis() - timer), false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.PRESORT, resourceName, index.size(), System.currentTimeMillis() - timer), false);
}
/**
* method to signal the incoming stack that one feeder has terminated
*/
@ -271,56 +271,56 @@ public final class RankingProcess extends Thread {
this.feeders--;
assert this.feeders >= 0 : "feeders = " + this.feeders;
}
protected void moreFeeders(final int countMoreFeeders) {
this.feeders += countMoreFeeders;
}
public boolean feedingIsFinished() {
return System.currentTimeMillis() - this.startTime > 50 && this.feeders == 0;
}
private boolean testFlags(final WordReference ientry) {
if (query.constraint == null) return true;
if (this.query.constraint == null) return true;
// test if ientry matches with filter
// if all = true: let only entries pass that has all matching bits
// if all = false: let all entries pass that has at least one matching bit
if (query.allofconstraint) {
if (this.query.allofconstraint) {
for (int i = 0; i < 32; i++) {
if ((query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
if ((this.query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
}
return true;
}
for (int i = 0; i < 32; i++) {
if ((query.constraint.get(i)) && (ientry.flags().get(i))) return true;
if ((this.query.constraint.get(i)) && (ientry.flags().get(i))) return true;
}
return false;
}
protected Map<byte[], ReferenceContainer<WordReference>> searchContainerMap() {
// direct access to the result maps is needed for abstract generation
// this is only available if execQuery() was called before
return localSearchInclusion;
return this.localSearchInclusion;
}
private WeakPriorityBlockingQueue.Element<WordReferenceVars> takeRWI(final boolean skipDoubleDom, final long waitingtime) {
// returns from the current RWI list the best entry and removes this entry from the list
WeakPriorityBlockingQueue<WordReferenceVars> m;
WeakPriorityBlockingQueue.Element<WordReferenceVars> rwi = null;
// take one entry from the stack if there are entries on that stack or the feeding is not yet finished
try {
//System.out.println("stack.poll: feeders = " + this.feeders + ", stack.sizeQueue = " + stack.sizeQueue());
int loops = 0; // a loop counter to terminate the reading if all the results are from the same domain
long timeout = System.currentTimeMillis() + waitingtime;
while (((!feedingIsFinished() && this.addRunning) || stack.sizeQueue() > 0) &&
final long timeout = System.currentTimeMillis() + waitingtime;
while (((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0) &&
(this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage)) {
if (waitingtime <= 0) {
rwi = stack.poll();
rwi = this.stack.poll();
} else timeoutloop:while (System.currentTimeMillis() < timeout) {
if (feedingIsFinished() && stack.sizeQueue() == 0) break timeoutloop;
rwi = stack.poll(50);
if (feedingIsFinished() && this.stack.sizeQueue() == 0) break timeoutloop;
rwi = this.stack.poll(50);
if (rwi != null) break timeoutloop;
}
if (rwi == null) break;
@ -328,14 +328,14 @@ public final class RankingProcess extends Thread {
//System.out.println("!skipDoubleDom");
return rwi;
}
// check doubledom
final String hosthash = rwi.getElement().hosthash();
synchronized (this.doubleDomCache) {
m = this.doubleDomCache.get(hosthash);
if (m == null) {
// first appearance of dom. we create an entry to signal that one of that domain was already returned
m = new WeakPriorityBlockingQueue<WordReferenceVars>((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
m = new WeakPriorityBlockingQueue<WordReferenceVars>((this.query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll);
this.doubleDomCache.put(hosthash, m);
return rwi;
}
@ -343,9 +343,9 @@ public final class RankingProcess extends Thread {
m.put(rwi);
}
}
} catch (InterruptedException e1) {}
} catch (final InterruptedException e1) {}
if (this.doubleDomCache.isEmpty()) return null;
// no more entries in sorted RWI entries. Now take Elements from the doubleDomCache
// find best entry from all caches
WeakPriorityBlockingQueue.Element<WordReferenceVars> bestEntry = null;
@ -355,7 +355,7 @@ public final class RankingProcess extends Thread {
while (i.hasNext()) {
try {
m = i.next();
} catch (ConcurrentModificationException e) {
} catch (final ConcurrentModificationException e) {
Log.logException(e);
continue; // not the best solution...
}
@ -372,14 +372,14 @@ public final class RankingProcess extends Thread {
}
}
if (bestEntry == null) return null;
// finally remove the best entry from the doubledom cache
m = this.doubleDomCache.get(bestEntry.getElement().hosthash());
bestEntry = m.poll();
}
return bestEntry;
}
/**
* get one metadata entry from the ranked results. This will be the 'best' entry so far
* according to the applied ranking. If there are no more entries left or the timeout
@ -400,26 +400,26 @@ public final class RankingProcess extends Thread {
if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi);
if (page == null) {
misses.add(obrwi.getElement().urlhash());
this.misses.add(obrwi.getElement().urlhash());
continue;
}
// prepare values for constraint check
final URIMetadataRow.Components metadata = page.metadata();
// check errors
if (metadata == null) {
this.sortout++;
continue; // rare case where the url is corrupted
}
if (!query.urlMask_isCatchall) {
if (!this.query.urlMask_isCatchall) {
// check url mask
if (!metadata.matches(query.urlMask)) {
if (!metadata.matches(this.query.urlMask)) {
this.sortout++;
continue;
}
// in case that we do not have e catchall filter for urls
// we must also construct the domain navigator here
//if (query.sitehash == null) {
@ -427,7 +427,7 @@ public final class RankingProcess extends Thread {
// this.hostResolver.put(UTF8.String(urlhash, 6, 6), UTF8.String(urlhash));
//}
}
// check for more errors
if (metadata.url() == null) {
this.sortout++;
@ -439,18 +439,18 @@ public final class RankingProcess extends Thread {
final String pagetitle = metadata.dc_title().toLowerCase();
// check exclusion
if ((QueryParams.anymatch(pagetitle, query.excludeHashes)) ||
(QueryParams.anymatch(pageurl.toLowerCase(), query.excludeHashes)) ||
(QueryParams.anymatch(pageauthor.toLowerCase(), query.excludeHashes))) {
if ((QueryParams.anymatch(pagetitle, this.query.excludeHashes)) ||
(QueryParams.anymatch(pageurl.toLowerCase(), this.query.excludeHashes)) ||
(QueryParams.anymatch(pageauthor.toLowerCase(), this.query.excludeHashes))) {
this.sortout++;
continue;
}
// check index-of constraint
if ((query.constraint != null) &&
(query.constraint.get(Condenser.flag_cat_indexof)) &&
if ((this.query.constraint != null) &&
(this.query.constraint.get(Condenser.flag_cat_indexof)) &&
(!(pagetitle.startsWith("index of")))) {
final Iterator<byte[]> wi = query.queryHashes.iterator();
final Iterator<byte[]> wi = this.query.queryHashes.iterator();
while (wi.hasNext()) {
this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash());
}
@ -459,41 +459,41 @@ public final class RankingProcess extends Thread {
}
// check location constraint
if ((query.constraint != null) &&
(query.constraint.get(Condenser.flag_cat_haslocation)) &&
if ((this.query.constraint != null) &&
(this.query.constraint.get(Condenser.flag_cat_haslocation)) &&
(metadata.lat() == 0.0f || metadata.lon() == 0.0f)) {
this.sortout++;
continue;
}
// check content domain
if ((query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) ||
(query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) ||
(query.contentdom == ContentDomain.IMAGE && page.limage() == 0) ||
(query.contentdom == ContentDomain.APP && page.lapp() == 0)) {
if ((this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) ||
(this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) ||
(this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0) ||
(this.query.contentdom == ContentDomain.APP && page.lapp() == 0)) {
this.sortout++;
continue;
}
// evaluate information of metadata for navigation
// author navigation:
if (pageauthor != null && pageauthor.length() > 0) {
// add author to the author navigator
String authorhash = ASCII.String(Word.word2hash(pageauthor));
final String authorhash = ASCII.String(Word.word2hash(pageauthor));
// check if we already are filtering for authors
if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) {
this.sortout++;
continue;
}
// add author to the author navigator
this.authorNavigator.inc(pageauthor);
} else if (this.query.authorhash != null) {
this.sortout++;
continue;
}
// namespace navigation
String pagepath = metadata.url().getPath();
if ((p = pagepath.indexOf(':')) >= 0) {
@ -504,49 +504,49 @@ public final class RankingProcess extends Thread {
this.namespaceNavigator.inc(pagepath);
}
}
// check Scanner
if (!Scanner.acceptURL(metadata.url())) {
this.sortout++;
continue;
}
// accept url
return page;
}
return null;
}
public int sizeQueue() {
int c = stack.sizeQueue();
for (WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
int c = this.stack.sizeQueue();
for (final WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
c += s.sizeQueue();
}
return c;
}
public int sizeAvailable() {
int c = stack.sizeAvailable();
for (WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
int c = this.stack.sizeAvailable();
for (final WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
c += s.sizeAvailable();
}
return c;
}
public boolean isEmpty() {
if (!stack.isEmpty()) return false;
for (WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
if (!this.stack.isEmpty()) return false;
for (final WeakPriorityBlockingQueue<WordReferenceVars> s: this.doubleDomCache.values()) {
if (!s.isEmpty()) return false;
}
return true;
}
public int[] flagCount() {
return flagcount;
return this.flagcount;
}
// "results from a total number of <remote_resourceSize + local_resourceSize> known (<local_resourceSize> local, <remote_resourceSize> remote), <remote_indexCount> links from <remote_peerCount> other YaCy peers."
public int filteredCount() {
// the number of index entries that are considered as result set
return this.stack.sizeAvailable();
@ -556,44 +556,44 @@ public final class RankingProcess extends Thread {
// the number of results in the local peer after filtering
return this.local_indexCount;
}
public int getRemoteIndexCount() {
// the number of result contributions from all the remote peers
return this.remote_indexCount;
}
public int getRemoteResourceSize() {
// the number of all hits in all the remote peers
return Math.max(this.remote_resourceSize, this.remote_indexCount);
}
public int getRemotePeerCount() {
// the number of remote peers that have contributed
return this.remote_peerCount;
}
public Iterator<byte[]> miss() {
return this.misses.iterator();
}
public int getMissCount() {
return this.misses.size();
}
public int getSortOutCount() {
return this.sortout;
}
public ScoreMap<String> getNamespaceNavigator() {
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace") < 0) return new ClusteredScoreMap<String>();
if (this.namespaceNavigator.sizeSmaller(2)) this.namespaceNavigator.clear(); // navigators with one entry are not useful
return this.namespaceNavigator;
}
public ScoreMap<String> getHostNavigator() {
ScoreMap<String> result = new ConcurrentScoreMap<String>();
final ScoreMap<String> result = new ConcurrentScoreMap<String>();
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return result;
final Iterator<String> domhashs = this.hostNavigator.keys(false);
URIMetadataRow row;
byte[] urlhash;
@ -613,14 +613,14 @@ public final class RankingProcess extends Thread {
}
public static final Comparator<Map.Entry<String, Integer>> mecomp = new Comparator<Map.Entry<String, Integer>>() {
public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
public int compare(final Map.Entry<String, Integer> o1, final Map.Entry<String, Integer> o2) {
if (o1.getValue().intValue() < o2.getValue().intValue()) return 1;
if (o2.getValue().intValue() < o1.getValue().intValue()) return -1;
return 0;
}
};
public ScoreMap<String> getTopicNavigator(int count) {
public ScoreMap<String> getTopicNavigator(final int count) {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
final ScoreMap<String> result = new ConcurrentScoreMap<String>();
@ -645,38 +645,40 @@ public final class RankingProcess extends Thread {
counts.put(word, q);
}
}
if (max > min) for (Map.Entry<String, Float> ce: counts.entrySet()) {
if (max > min) for (final Map.Entry<String, Float> ce: counts.entrySet()) {
result.set(ce.getKey(), (int) (((double) count) * (ce.getValue() - min) / (max - min)));
}
return this.ref;
}
private final static Pattern lettermatch = Pattern.compile("[a-z]+");
public void addTopic(final String[] words) {
String word;
for (final String w : words) {
word = w.toLowerCase();
if (word.length() > 2 &&
"http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off".indexOf(word) < 0 &&
!query.queryHashes.has(Word.word2hash(word)) &&
word.matches("[a-z]+") &&
!this.query.queryHashes.has(Word.word2hash(word)) &&
lettermatch.matcher(word).matches() &&
!Switchboard.badwords.contains(word) &&
!Switchboard.stopwords.contains(word)) {
ref.inc(word);
this.ref.inc(word);
}
}
}
protected void addTopics(final ResultEntry resultEntry) {
// take out relevant information for reference computation
if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
//final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description
// add references
//addTopic(urlcomps);
addTopic(descrcomps);
}
public ScoreMap<String> getAuthorNavigator() {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls

@ -1,4 +1,4 @@
// Dispatcher.java
// Dispatcher.java
// ------------------------------
// part of YaCy
// (C) 2009 by Michael Peter Christen; mc@yacy.net
@ -28,6 +28,7 @@ package de.anomic.yacy.dht;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
@ -41,11 +42,9 @@ import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import de.anomic.search.Segment;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;
import java.util.List;
public class Dispatcher {
@ -80,30 +79,30 @@ public class Dispatcher {
* or the target queue runs out of entries. If the target queue is empty, the transmission is
* called failed. In case of a fail, the RWI fragment is put back into the backend index structure
*/
// a cloud is a cache for the objects that wait to be transmitted
// the String-key is the primary target as contained in the Entry
private Map<ByteArray, Transmission.Chunk> transmissionCloud;
// the segment backend is used to store the remaining indexContainers in case that the object is closed
private final Segment segment;
// the seed database
private final yacySeedDB seeds;
// the log
private final Log log;
// transmission process
private WorkflowProcessor<Transmission.Chunk> indexingTransmissionProcessor;
// transmission object
private final Transmission transmission;
public Dispatcher(
final Segment segment,
final yacySeedDB seeds,
final boolean gzipBody,
final boolean gzipBody,
final int timeout
) {
this.transmissionCloud = new ConcurrentHashMap<ByteArray, Transmission.Chunk>();
@ -111,28 +110,28 @@ public class Dispatcher {
this.seeds = seeds;
this.log = new Log("INDEX-TRANSFER-DISPATCHER");
this.transmission = new Transmission(
log,
this.log,
segment,
seeds,
gzipBody,
timeout);
int concurrentSender = Math.min(32, Math.max(10, WorkflowProcessor.availableCPU));
indexingTransmissionProcessor = new WorkflowProcessor<Transmission.Chunk>(
final int concurrentSender = Math.min(32, Math.max(10, WorkflowProcessor.availableCPU));
this.indexingTransmissionProcessor = new WorkflowProcessor<Transmission.Chunk>(
"transferDocumentIndex",
"This is the RWI transmission process",
new String[]{"RWI/Cache/Collections"},
this, "transferDocumentIndex", concurrentSender * 2, null, concurrentSender);
}
public int cloudSize() {
return (this.transmissionCloud == null) ? 0 : this.transmissionCloud.size();
}
public int transmissionSize() {
return (this.indexingTransmissionProcessor == null) ? 0 : this.indexingTransmissionProcessor.queueSize();
}
/**
* PROCESS(1)
* select a number of index containers from the backend index.
@ -150,15 +149,15 @@ public class Dispatcher {
final int maxContainerCount,
final int maxReferenceCount,
final int maxtime) throws IOException {
// prefer file
ArrayList<ReferenceContainer<WordReference>> containers = selectContainers(hash, limitHash, maxContainerCount, maxReferenceCount, maxtime, false);
final ArrayList<ReferenceContainer<WordReference>> containers = selectContainers(hash, limitHash, maxContainerCount, maxReferenceCount, maxtime, false);
// if ram does not provide any result, take from file
//if (containers.isEmpty()) containers = selectContainers(hash, limitHash, maxContainerCount, maxtime, false);
return containers;
}
private ArrayList<ReferenceContainer<WordReference>> selectContainers(
final byte[] hash,
final byte[] limitHash,
@ -166,9 +165,9 @@ public class Dispatcher {
final int maxReferenceCount,
final int maxtime,
final boolean ram) throws IOException {
final ArrayList<ReferenceContainer<WordReference>> containers = new ArrayList<ReferenceContainer<WordReference>>(maxContainerCount);
final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = this.segment.termIndex().references(hash, true, ram);
ReferenceContainer<WordReference> container;
int refcount = 0;
@ -183,7 +182,7 @@ public class Dispatcher {
((container = indexContainerIterator.next()) != null) &&
((containers.isEmpty()) ||
(Base64Order.enhancedCoder.compare(container.getTermHash(), limitHash) < 0))
) {
if (container.isEmpty()) continue;
refcount += container.size();
@ -193,12 +192,12 @@ public class Dispatcher {
final ArrayList<ReferenceContainer<WordReference>> rc;
if (ram) {
// selection was only from ram, so we have to carefully remove only the selected entries
HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
Iterator<WordReference> it;
for (ReferenceContainer<WordReference> c: containers) {
for (final ReferenceContainer<WordReference> c: containers) {
urlHashes.clear();
it = c.entries();
while (it.hasNext()) try { urlHashes.put(it.next().urlhash()); } catch (RowSpaceExceededException e) { Log.logException(e); }
while (it.hasNext()) try { urlHashes.put(it.next().urlhash()); } catch (final RowSpaceExceededException e) { Log.logException(e); }
if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + ASCII.String(c.getTermHash()) + "'");
if (!urlHashes.isEmpty()) this.segment.termIndex().remove(c.getTermHash(), urlHashes);
}
@ -207,7 +206,7 @@ public class Dispatcher {
// selection was from whole index, so we can just delete the whole container
// but to avoid race conditions return the results from the deletes
rc = new ArrayList<ReferenceContainer<WordReference>>(containers.size());
for (ReferenceContainer<WordReference> c: containers) {
for (final ReferenceContainer<WordReference> c: containers) {
container = this.segment.termIndex().delete(c.getTermHash()); // be aware this might be null!
if (container != null && !container.isEmpty()) {
if (this.log.isFine()) this.log.logFine("selected " + container.size() + " urls for word '" + ASCII.String(c.getTermHash()) + "'");
@ -215,7 +214,7 @@ public class Dispatcher {
}
}
}
// finished. The caller must take care of the containers and must put them back if not needed
return rc;
}
@ -226,33 +225,33 @@ public class Dispatcher {
* @param containers
* @param scheme
* @return
* @throws RowSpaceExceededException
* @throws RowSpaceExceededException
*/
@SuppressWarnings("unchecked")
private List<ReferenceContainer<WordReference>>[] splitContainers(List<ReferenceContainer<WordReference>> containers) throws RowSpaceExceededException {
private List<ReferenceContainer<WordReference>>[] splitContainers(final List<ReferenceContainer<WordReference>> containers) throws RowSpaceExceededException {
// init the result vector
int partitionCount = this.seeds.scheme.verticalPartitions();
List<ReferenceContainer<WordReference>>[] partitions = (ArrayList<ReferenceContainer<WordReference>>[]) new ArrayList[partitionCount];
final int partitionCount = this.seeds.scheme.verticalPartitions();
final List<ReferenceContainer<WordReference>>[] partitions = new ArrayList[partitionCount];
for (int i = 0; i < partitions.length; i++) partitions[i] = new ArrayList<ReferenceContainer<WordReference>>();
// check all entries and split them to the partitions
ReferenceContainer<WordReference>[] partitionBuffer = new ReferenceContainer[partitionCount];
final ReferenceContainer<WordReference>[] partitionBuffer = new ReferenceContainer[partitionCount];
WordReference re;
for (ReferenceContainer<WordReference> container: containers) {
for (final ReferenceContainer<WordReference> container: containers) {
// init the new partitions
for (int j = 0; j < partitionBuffer.length; j++) {
partitionBuffer[j] = new ReferenceContainer<WordReference>(Segment.wordReferenceFactory, container.getTermHash(), container.size() / partitionCount);
}
// split the container
Iterator<WordReference> i = container.entries();
final Iterator<WordReference> i = container.entries();
while (i.hasNext()) {
re = i.next();
if (re == null) continue;
partitionBuffer[this.seeds.scheme.verticalPosition(re.urlhash())].add(re);
}
// add the containers to the result vector
for (int j = 0; j < partitionBuffer.length; j++) {
partitions[j].add(partitionBuffer[j]);
@ -260,7 +259,7 @@ public class Dispatcher {
}
return partitions;
}
/**
* PROCESS(3) and PROCESS(4)
* put containers into cloud. This needs information about the network,
@ -272,7 +271,7 @@ public class Dispatcher {
* then no additional IO is necessary.
*/
private void enqueueContainersToCloud(final List<ReferenceContainer<WordReference>>[] containers) {
if (transmissionCloud == null) return;
if (this.transmissionCloud == null) return;
ReferenceContainer<WordReference> lastContainer;
byte[] primaryTarget;
ByteArray pTArray;
@ -283,16 +282,16 @@ public class Dispatcher {
primaryTarget = FlatWordPartitionScheme.positionToHash(this.seeds.scheme.dhtPosition(lastContainer.getTermHash(), vertical));
assert primaryTarget[2] != '@';
pTArray = new ByteArray(primaryTarget);
// get or make a entry object
entry = this.transmissionCloud.get(pTArray); // if this is not null, the entry is extended here
List<yacySeed> targets = PeerSelection.getAcceptRemoteIndexSeedsList(
seeds,
final List<yacySeed> targets = PeerSelection.getAcceptRemoteIndexSeedsList(
this.seeds,
primaryTarget,
seeds.redundancy() * 3,
this.seeds.redundancy() * 3,
true);
this.log.logInfo("enqueueContainers: selected " + targets.size() + " targets for primary target key " + ASCII.String(primaryTarget) + "/" + vertical + " with " + containers[vertical].size() + " index containers.");
if (entry == null) entry = transmission.newChunk(primaryTarget, targets);
if (entry == null) entry = this.transmission.newChunk(primaryTarget, targets);
/*/ lookup targets
int sc = 1;
@ -303,17 +302,17 @@ public class Dispatcher {
this.log.logInfo("enqueueContainers: distance to first container at position " + sc + ": " + FlatWordPartitionScheme.std.dhtDistance(FlatWordPartitionScheme.positionToHash(this.seeds.scheme.dhtPosition(containers[vertical].get(0).getTermHash(), vertical)), null, seed));
sc++;
}*/
// fill the entry with the containers
for (ReferenceContainer<WordReference> c: containers[vertical]) {
for (final ReferenceContainer<WordReference> c: containers[vertical]) {
try {
entry.add(c);
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
Log.logException(e);
break;
}
}
// put the entry into the cloud
if (entry.containersSize() > 0) this.transmissionCloud.put(pTArray, entry);
}
@ -330,12 +329,12 @@ public class Dispatcher {
List<ReferenceContainer<WordReference>> selectedContainerCache;
try {
selectedContainerCache = selectContainers(hash, limitHash, maxContainerCount, maxReferenceCount, maxtime);
} catch (IOException e) {
} catch (final IOException e) {
this.log.logSevere("selectContainersEnqueueToCloud: selectedContainer failed", e);
return false;
}
this.log.logInfo("selectContainersEnqueueToCloud: selectedContainerCache was filled with " + selectedContainerCache.size() + " entries");
if (selectedContainerCache == null || selectedContainerCache.isEmpty()) {
this.log.logInfo("selectContainersEnqueueToCloud: selectedContainerCache is empty, cannot do anything here.");
return false;
@ -344,7 +343,7 @@ public class Dispatcher {
List<ReferenceContainer<WordReference>>[] splitContainerCache;
try {
splitContainerCache = splitContainers(selectedContainerCache);
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
this.log.logSevere("selectContainersEnqueueToCloud: splitContainers failed because of too low RAM", e);
return false;
}
@ -363,7 +362,7 @@ public class Dispatcher {
this.log.logInfo("selectContainersEnqueueToCloud: splitContainerCache enqueued to cloud array which has now " + this.transmissionCloud.size() + " entries.");
return true;
}
/**
* PROCESS(5)
* take the largest container from the cloud and put it into the 'next' array,
@ -371,43 +370,50 @@ public class Dispatcher {
* This method returns true if a container was dequeued, false if not
*/
public boolean dequeueContainer() {
if (transmissionCloud == null) return false;
if (this.indexingTransmissionProcessor.queueSize() > indexingTransmissionProcessor.concurrency()) return false;
if (this.transmissionCloud == null) return false;
if (this.indexingTransmissionProcessor.queueSize() > this.indexingTransmissionProcessor.concurrency()) return false;
ByteArray maxtarget = null;
int maxsize = -1;
for (Map.Entry<ByteArray, Transmission.Chunk> chunk: this.transmissionCloud.entrySet()) {
for (final Map.Entry<ByteArray, Transmission.Chunk> chunk: this.transmissionCloud.entrySet()) {
if (chunk.getValue().containersSize() > maxsize) {
maxsize = chunk.getValue().containersSize();
maxtarget = chunk.getKey();
}
}
if (maxsize < 0) return false;
Transmission.Chunk chunk = this.transmissionCloud.remove(maxtarget);
final Transmission.Chunk chunk = this.transmissionCloud.remove(maxtarget);
try {
this.indexingTransmissionProcessor.enQueue(chunk);
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
return true;
}
public Transmission.Chunk transferDocumentIndex(Transmission.Chunk chunk) {
/**
* transfer job: this method is called using reflection from the switchboard
* the method is called as a Workflow process. That means it is always called whenever
* a job is placed in the workflow queue. This happens in dequeueContainer()
* @param chunk
* @return
*/
public Transmission.Chunk transferDocumentIndex(final Transmission.Chunk chunk) {
// do the transmission
boolean success = chunk.transmit();
final boolean success = chunk.transmit();
if (success && chunk.isFinished()) {
// finished with this queue!
this.log.logInfo("STORE: Chunk " + ASCII.String(chunk.primaryTarget()) + " has FINISHED all transmissions!");
return chunk;
}
if (!success) this.log.logInfo("STORE: Chunk " + ASCII.String(chunk.primaryTarget()) + " has failed to transmit index; marked peer as busy");
if (chunk.canFinish()) {
try {
if (this.indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.enQueue(chunk);
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
return null;
}
@ -420,12 +426,12 @@ public class Dispatcher {
public void close() {
// removes all entries from the dispatcher and puts them back to a RAMRI
if (indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.announceShutdown();
if (this.indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.announceShutdown();
if (this.transmissionCloud != null) {
outerLoop: for (Map.Entry<ByteArray, Transmission.Chunk> e : this.transmissionCloud.entrySet()) {
for (ReferenceContainer<WordReference> i : e.getValue()) try {
outerLoop: for (final Map.Entry<ByteArray, Transmission.Chunk> e : this.transmissionCloud.entrySet()) {
for (final ReferenceContainer<WordReference> i : e.getValue()) try {
this.segment.termIndex().add(i);
} catch (Exception e1) {
} catch (final Exception e1) {
Log.logException(e1);
break outerLoop;
}
@ -433,11 +439,11 @@ public class Dispatcher {
this.transmissionCloud.clear();
}
this.transmissionCloud = null;
if (indexingTransmissionProcessor != null) {
if (this.indexingTransmissionProcessor != null) {
this.indexingTransmissionProcessor.awaitShutdown(10000);
this.indexingTransmissionProcessor.clear();
}
this.indexingTransmissionProcessor = null;
}
}

@ -1,4 +1,4 @@
//SimpleLogFormatter.java
//SimpleLogFormatter.java
//-------------------------------------
//part of YACY
//(C) by Michael Peter Christen; mc@yacy.net
@ -50,13 +50,13 @@ public final class SimpleLogFormatter extends SimpleFormatter {
public SimpleLogFormatter() {
super();
}
@Override
public final synchronized String format(final LogRecord record) {
final StringBuffer stringBuffer = this.buffer;
stringBuffer.setLength(0);
// adding the loglevel
final int logLevel = record.getLevel().intValue();
if (logLevel == Log.LOGLEVEL_SEVERE)
@ -67,16 +67,16 @@ public final class SimpleLogFormatter extends SimpleFormatter {
this.buffer.append(Log.LOGTOKEN_CONFIG);
else if (logLevel == Log.LOGLEVEL_INFO)
this.buffer.append(Log.LOGTOKEN_INFO);
else if (logLevel == Log.LOGLEVEL_FINE)
else if (logLevel == Log.LOGLEVEL_FINE)
this.buffer.append(Log.LOGTOKEN_FINE);
else if (logLevel == Log.LOGLEVEL_FINER)
this.buffer.append(Log.LOGTOKEN_FINER);
else if (logLevel == Log.LOGLEVEL_FINEST)
this.buffer.append(Log.LOGTOKEN_FINEST);
else
else if (logLevel == Log.LOGLEVEL_FINER)
this.buffer.append(Log.LOGTOKEN_FINER);
else if (logLevel == Log.LOGLEVEL_FINEST)
this.buffer.append(Log.LOGTOKEN_FINEST);
else
this.buffer.append(Log.LOGTOKEN_FINE);
this.buffer.append(' ');
// adding the logging date
this.date.setTime(record.getMillis());
this.position.setBeginIndex(0);
@ -85,11 +85,11 @@ public final class SimpleLogFormatter extends SimpleFormatter {
// adding the logger name
stringBuffer.append(' ');
stringBuffer.append(record.getLoggerName());
// adding the logging message
stringBuffer.append(' ');
stringBuffer.append(formatMessage(record));
// adding the stack trace if available
stringBuffer.append(System.getProperty("line.separator"));
if (record.getThrown() != null) {

@ -464,17 +464,17 @@ public class Table implements Index, Iterable<Row.Entry> {
final int i = (int) this.index.get(key);
if (i == -1) return null;
final byte[] b = new byte[this.rowdef.objectsize];
if (this.table == null) {
final Row.Entry cacherow;
if (this.table == null || (cacherow = this.table.get(i, false)) == null) {
// read row from the file
this.file.get(i, b, 0);
} else {
// construct the row using the copy in RAM
final Row.Entry v = this.table.get(i, false);
assert v != null;
if (v == null) return null;
assert cacherow != null;
if (cacherow == null) return null;
assert key.length == this.rowdef.primaryKeyLength;
System.arraycopy(key, 0, b, 0, key.length);
System.arraycopy(v.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength);
System.arraycopy(cacherow.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength);
}
return this.rowdef.newEntry(b);
}
@ -503,7 +503,7 @@ public class Table implements Index, Iterable<Row.Entry> {
assert this.table == null || this.table.size() == this.index.size() : "table.size() = " + this.table.size() + ", index.size() = " + this.index.size();
assert row != null;
assert row.bytes() != null;
if ((row == null) || (row.bytes() == null)) return null;
if (row == null || row.bytes() == null) return null;
final int i = (int) this.index.get(row.getPrimaryKeyBytes());
if (i == -1) {
try {
@ -517,17 +517,17 @@ public class Table implements Index, Iterable<Row.Entry> {
}
final byte[] b = new byte[this.rowdef.objectsize];
if (this.table == null) {
Row.Entry cacherow;
if (this.table == null || (cacherow = this.table.get(i, false)) == null) {
// read old value
this.file.get(i, b, 0);
// write new value
this.file.put(i, row.bytes(), 0);
} else {
// read old value
final Row.Entry v = this.table.get(i, false);
assert v != null;
assert cacherow != null;
System.arraycopy(row.getPrimaryKeyBytes(), 0, b, 0, this.rowdef.primaryKeyLength);
System.arraycopy(v.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength);
System.arraycopy(cacherow.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength);
// write new value
try {
this.table.set(i, this.taildef.newEntry(row.bytes(), this.rowdef.primaryKeyLength, true));
@ -573,13 +573,12 @@ public class Table implements Index, Iterable<Row.Entry> {
this.file.put(i, row.bytes(), 0);
} else {
// write new value
try {
this.file.put(i, row.bytes(), 0);
if (abandonTable()) this.table = null; else try {
this.table.set(i, this.taildef.newEntry(row.bytes(), this.rowdef.primaryKeyLength, true));
} catch (final RowSpaceExceededException e) {
this.table = null;
}
if (abandonTable()) this.table = null;
this.file.put(i, row.bytes(), 0);
}
assert this.file.size() == this.index.size() : "file.size() = " + this.file.size() + ", index.size() = " + this.index.size();
assert this.table == null || this.table.size() == this.index.size() : "table.size() = " + this.table.size() + ", index.size() = " + this.index.size();
@ -665,7 +664,8 @@ public class Table implements Index, Iterable<Row.Entry> {
final int sb = this.index.size();
int ix;
assert i < this.index.size();
if (this.table == null) {
final Row.Entry cacherow;
if (this.table == null || (cacherow = this.table.get(i, false)) == null) {
if (i == this.index.size() - 1) {
// element is at last entry position
ix = (int) this.index.remove(key);
@ -697,9 +697,8 @@ public class Table implements Index, Iterable<Row.Entry> {
assert this.file.size() == this.index.size() : "file.size() = " + this.file.size() + ", index.size() = " + this.index.size();
} else {
// get result value from the table copy, so we don't need to read it from the file
final Row.Entry v = this.table.get(i, false);
System.arraycopy(key, 0, b, 0, key.length);
System.arraycopy(v.bytes(), 0, b, this.rowdef.primaryKeyLength, this.taildef.objectsize);
System.arraycopy(cacherow.bytes(), 0, b, this.rowdef.primaryKeyLength, this.taildef.objectsize);
if (i == this.index.size() - 1) {
// special handling if the entry is the last entry in the file
@ -911,7 +910,8 @@ public class Table implements Index, Iterable<Row.Entry> {
this.c = (int) Table.this.index.get(k);
if (this.c < 0) throw new ConcurrentModificationException(); // this should only happen if the table was modified during the iteration
final byte[] b = new byte[Table.this.rowdef.objectsize];
if (Table.this.table == null) {
final Row.Entry cacherow;
if (Table.this.table == null || (cacherow = Table.this.table.get(this.c, false)) == null) {
// read from file
try {
Table.this.file.get(this.c, b, 0);
@ -921,11 +921,10 @@ public class Table implements Index, Iterable<Row.Entry> {
}
} else {
// compose from table and key
final Row.Entry v = Table.this.table.get(this.c, false);
assert v != null;
if (v == null) return null;
assert cacherow != null;
if (cacherow == null) return null;
System.arraycopy(k, 0, b, 0, Table.this.rowdef.primaryKeyLength);
System.arraycopy(v.bytes(), 0, b, Table.this.rowdef.primaryKeyLength, Table.this.taildef.objectsize);
System.arraycopy(cacherow.bytes(), 0, b, Table.this.rowdef.primaryKeyLength, Table.this.taildef.objectsize);
}
return Table.this.rowdef.newEntry(b);
}

Loading…
Cancel
Save