diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java index b0cbb88c5..a68b4d865 100644 --- a/source/de/anomic/data/URLAnalysis.java +++ b/source/de/anomic/data/URLAnalysis.java @@ -9,7 +9,7 @@ // $LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -61,10 +61,9 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.rwi.ReferenceContainerArray; import net.yacy.kelondro.util.MemoryControl; - import de.anomic.search.MetadataRepository; -import de.anomic.search.Segment; import de.anomic.search.MetadataRepository.Export; +import de.anomic.search.Segment; public class URLAnalysis { @@ -78,21 +77,21 @@ public class URLAnalysis { static { try { poison = new DigestURI("http://poison.org/poison"); - } catch (MalformedURLException e) { + } catch (final MalformedURLException e) { poison = null; } } - + public static class splitter extends Thread { - - private ArrayBlockingQueue in; - private ConcurrentHashMap out; + + private final ArrayBlockingQueue in; + private final ConcurrentHashMap out; public splitter(final ArrayBlockingQueue in, final ConcurrentHashMap out) { this.in = in; this.out = out; } - + @Override public void run() { try { @@ -100,29 +99,29 @@ public class URLAnalysis { final Pattern p = Pattern.compile("~|\\(|\\)|\\+|-|@|:|%|\\.|;|_"); while (true) { try { - url = in.take(); + url = this.in.take(); if (url == poison) break; update(patternMinus.matcher(url.getHost()).replaceAll("\\.").split("\\.")); update(p.matcher(url.getPath()).replaceAll("/").split("/")); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Log.logException(e); } } - } catch (Exception e) { + } catch (final Exception e) { Log.logException(e); } } - + private void update(final String[] s) { Integer c; for (final String t: s) { if (t.length() == 0) continue; - c = out.get(t); - out.put(t, (c == null) ? 1 : c.intValue() + 1); + c = this.out.get(t); + this.out.put(t, (c == null) ? 1 : c.intValue() + 1); } } } - + public static void cleanup(final ConcurrentHashMap stat) { Map.Entry entry; int c, low = Integer.MAX_VALUE; @@ -146,7 +145,7 @@ public class URLAnalysis { } Runtime.getRuntime().gc(); } - + public static void genstat(final String urlfile) { final boolean gz = urlfile.endsWith(".gz"); @@ -165,7 +164,7 @@ public class URLAnalysis { final File outfile = new File(analysis); BufferedReader reader = null; long time = System.currentTimeMillis(); - long start = time; + final long start = time; int count = 0; System.out.println("start processing"); @@ -178,11 +177,11 @@ public class URLAnalysis { line = line.trim(); if (line.length() > 0) { try { - DigestURI url = new DigestURI(line); + final DigestURI url = new DigestURI(line); in.put(url); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Log.logException(e); - } catch (MalformedURLException e) { + } catch (final MalformedURLException e) { continue; } } @@ -208,15 +207,15 @@ public class URLAnalysis { System.out.println("stopping threads"); for (int i = 0, available = Runtime.getRuntime().availableProcessors() + 1; i < available; i++) try { in.put(poison); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Log.logException(e); } try { spl.join(); - } catch (InterruptedException e1) { + } catch (final InterruptedException e1) { Log.logException(e1); } - + // generate statistics System.out.println("start processing results"); final TreeMap results = new TreeMap(); @@ -233,7 +232,7 @@ public class URLAnalysis { System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left"); } } - + // write statistics System.out.println("start writing results"); try { @@ -252,13 +251,13 @@ public class URLAnalysis { } } os.close(); - } catch (IOException e) { + } catch (final IOException e) { Log.logException(e); } - + System.out.println("finished"); } - + public static void genhost(final String urlfile) { final boolean gz = urlfile.endsWith(".gz"); @@ -280,9 +279,9 @@ public class URLAnalysis { line = line.trim(); if (line.length() > 0) { try { - DigestURI url = new DigestURI(line); + final DigestURI url = new DigestURI(line); hosts.add(url.getHost()); - } catch (MalformedURLException e) { + } catch (final MalformedURLException e) { continue; } } @@ -298,7 +297,7 @@ public class URLAnalysis { } finally { if (reader != null) try { reader.close(); } catch (final Exception e) {} } - + // copy everything into a TreeSet to order it System.out.println("start processing results"); final TreeSet results = new TreeSet(); @@ -313,18 +312,18 @@ public class URLAnalysis { System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left"); } } - + // write hosts writeSet(trunk, gz, results); - + System.out.println("finished"); } - + private static void writeSet(final String trunk, final boolean gz, final Set set) { // write hosts System.out.println("start writing results"); - File outfile = new File(trunk + ((gz) ? ".gz" : "")); + final File outfile = new File(trunk + ((gz) ? ".gz" : "")); long time = System.currentTimeMillis(); try { OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile)); @@ -340,13 +339,13 @@ public class URLAnalysis { } } os.close(); - } catch (IOException e) { + } catch (final IOException e) { Log.logException(e); } System.out.println("finished writing results"); } - + public static void sortsplit(final String urlfile) { final boolean gz = urlfile.endsWith(".gz"); @@ -370,9 +369,9 @@ public class URLAnalysis { line = line.trim(); if (line.length() > 0) { try { - DigestURI url = new DigestURI(line); + final DigestURI url = new DigestURI(line); urls.add(url.toNormalform(true, true)); - } catch (MalformedURLException e) { + } catch (final MalformedURLException e) { continue; } } @@ -394,13 +393,13 @@ public class URLAnalysis { } finally { if (reader != null) try { reader.close(); } catch (final Exception e) {} } - + // write hosts writeSet(trunk + "." + filecount, gz, urls); System.out.println("finished"); } - + public static void incell(final File cellPath, final String statisticPath) { try { final HandleMap idx = ReferenceContainerArray.referenceHashes( @@ -412,7 +411,7 @@ public class URLAnalysis { idx.dump(new File(statisticPath)); System.out.println("INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath); idx.close(); - } catch (Exception e) { + } catch (final Exception e) { Log.logException(e); } } @@ -444,7 +443,7 @@ public class URLAnalysis { System.out.println("INDEX DIFF URL-COL finished dump, wrote " + count + " references that occur in the URL-DB, but not in the collection-dump"); return count; } - + public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException { // format: 0=text, 1=html, 2=rss/xml System.out.println("URL EXPORT startup"); @@ -454,12 +453,12 @@ public class URLAnalysis { final Export e = mr.export(new File(export), ".*", hs, format, false); try { e.join(); - } catch (InterruptedException e1) { + } catch (final InterruptedException e1) { Log.logException(e1); } System.out.println("URL EXPORT finished export, wrote " + ((hs == null) ? mr.size() : hs.size()) + " entries"); } - + public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException { System.out.println("URL DELETE startup"); final MetadataRepository mr = new MetadataRepository(new File(metadataPath), "text.urlmd", false, false); @@ -471,12 +470,12 @@ public class URLAnalysis { } System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database"); } - + public static void main(final String[] args) { if (args[0].equals("-stat") && args.length >= 2) { // generate a statistics about common words in file, store to .stat // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz + // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz for (int i = 1; i < args.length; i++) genstat(args[i]); } else if (args[0].equals("-host") && args.length >= 2) { // generate a file .host containing only the hosts of the urls @@ -495,7 +494,7 @@ public class URLAnalysis { // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT/METADATA used.dump diffurlcol.dump try { diffurlcol(args[1], args[2], args[3]); - } catch (Exception e) { + } catch (final Exception e) { Log.logException(e); } } else if (args[0].equals("-export") && args.length >= 4) { @@ -503,10 +502,10 @@ public class URLAnalysis { // example: // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html' - int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0; + final int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0; try { export(args[1], format, args[3], (args.length >= 5) ? args[4] : null); - } catch (Exception e) { + } catch (final Exception e) { Log.logException(e); } } else if (args[0].equals("-delete") && args.length >= 3) { @@ -516,7 +515,7 @@ public class URLAnalysis { // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html' try { delete(args[1], args[2]); - } catch (Exception e) { + } catch (final Exception e) { Log.logException(e); } } else { @@ -553,9 +552,9 @@ public class URLAnalysis { } System.exit(0); // kill remaining threads } - + private static final String num(final int i) { - StringBuffer s = new StringBuffer(Integer.toString(i)); + final StringBuilder s = new StringBuilder(Integer.toString(i)); while (s.length() < 9) s.insert(0, "0"); return s.toString(); } diff --git a/source/de/anomic/data/ymark/YMarkCrawlStart.java b/source/de/anomic/data/ymark/YMarkCrawlStart.java index 062d9b886..7aeb4aea0 100644 --- a/source/de/anomic/data/ymark/YMarkCrawlStart.java +++ b/source/de/anomic/data/ymark/YMarkCrawlStart.java @@ -9,7 +9,7 @@ // $LastChangedBy: apfelmaennchen $ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -32,39 +32,38 @@ import java.util.Iterator; import java.util.regex.Pattern; import net.yacy.kelondro.blob.Tables; - import de.anomic.data.WorkTables; public class YMarkCrawlStart extends HashMap{ private static final long serialVersionUID = 1L; - private WorkTables worktables; - + private final WorkTables worktables; + public YMarkCrawlStart(final WorkTables worktables) { this.worktables = worktables; } - + public YMarkCrawlStart(final WorkTables worktables, final String url) { this.worktables = worktables; - this.clear(); - this.load(url); + clear(); + load(url); } - - public void load(String url) { + + public void load(final String url) { try { - final StringBuffer buffer = new StringBuffer(500); + final StringBuilder buffer = new StringBuilder(500); buffer.append("^.*crawlingURL=\\Q"); buffer.append(url); buffer.append("\\E?.*"); - final Pattern pattern = Pattern.compile(buffer.toString()); + final Pattern pattern = Pattern.compile(buffer.toString()); final Iterator APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern); - Tables.Row row = null; + Tables.Row row = null; while(APIcalls.hasNext()) { row = APIcalls.next(); if(row.get(WorkTables.TABLE_API_COL_TYPE, "").equals("crawler")) { buffer.setLength(0); buffer.append(row.get(WorkTables.TABLE_API_COL_URL, "")); - buffer.delete(0, buffer.indexOf("?")+1); + buffer.delete(0, buffer.indexOf("?")+1); int start = 0; int end = 0; String key; @@ -78,12 +77,12 @@ public class YMarkCrawlStart extends HashMap{ end = buffer.length()-1; value = buffer.substring(start, end); start = end+1; - this.put(key, value); + put(key, value); } break; - } + } } - } catch (IOException e) { + } catch (final IOException e) { // TODO Auto-generated catch block } } diff --git a/source/de/anomic/data/ymark/YMarkTables.java b/source/de/anomic/data/ymark/YMarkTables.java index 22d3585b3..6f8282b4e 100644 --- a/source/de/anomic/data/ymark/YMarkTables.java +++ b/source/de/anomic/data/ymark/YMarkTables.java @@ -9,7 +9,7 @@ // $LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -40,48 +40,48 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import de.anomic.data.WorkTables; public class YMarkTables { - + public static enum TABLES { BOOKMARKS ("_bookmarks"), TAGS ("_tags"), FOLDERS ("_folders"); - + private String basename; - - private TABLES(String b) { + + private TABLES(final String b) { this.basename = b; } public String basename() { return this.basename; } - public String tablename(String bmk_user) { + public String tablename(final String bmk_user) { return bmk_user+this.basename; } } - + public static enum PROTOCOLS { HTTP ("http://"), HTTPS ("https://"); - + private String protocol; - - private PROTOCOLS(String s) { + + private PROTOCOLS(final String s) { this.protocol = s; } public String protocol() { return this.protocol; } - public String protocol(String s) { + public String protocol(final String s) { return this.protocol+s; } } - - public final static String FOLDERS_ROOT = "/"; + + public final static String FOLDERS_ROOT = "/"; public final static String BOOKMARKS_LOG = "BOOKMARKS"; public final static String USER_ADMIN = "admin"; public final static String USER_AUTHENTICATE = "AUTHENTICATE"; public final static String USER_AUTHENTICATE_MSG = "Authentication required!"; - + public final static String p1 = "(?:^|.*,)"; public final static String p2 = "\\Q"; public final static String p3 = "\\E"; @@ -90,15 +90,15 @@ public class YMarkTables { public final static String p6 = "),.*){"; public final static String p7 = "/.*)"; public final static String p8 = "(?:,|$)"; - + public final static int BUFFER_LENGTH = 256; - + private final WorkTables worktables; - + public YMarkTables(final Tables wt) { this.worktables = (WorkTables)wt; } - + public void deleteBookmark(final String bmk_user, final byte[] urlHash) throws IOException, RowSpaceExceededException { final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); Tables.Row bmk_row = null; @@ -107,11 +107,11 @@ public class YMarkTables { this.worktables.delete(bmk_table,urlHash); } } - + public void deleteBookmark(final String bmk_user, final String url) throws IOException, RowSpaceExceededException { this.deleteBookmark(bmk_user, YMarkUtil.getBookmarkId(url)); } - + public TreeMap getTags(final Iterator rowIterator) { final TreeMap tags = new TreeMap(); Tables.Row bmk_row = null; @@ -133,14 +133,14 @@ public class YMarkTables { } return tags; } - + public TreeMap getTags(final String bmk_user) throws IOException { final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); final TreeMap tags = getTags(this.worktables.iterator(bmk_table)); return tags; } - - + + public TreeSet getFolders(final String bmk_user, final String root) throws IOException { final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); final TreeSet folders = new TreeSet(); @@ -155,11 +155,11 @@ public class YMarkTables { final Pattern r = Pattern.compile(patternBuilder.toString()); final Iterator bit = this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.FOLDERS.key(), r); Tables.Row bmk_row = null; - + while(bit.hasNext()) { bmk_row = bit.next(); - if(bmk_row.containsKey(YMarkEntry.BOOKMARK.FOLDERS.key())) { - final String[] folderArray = (new String(bmk_row.get(YMarkEntry.BOOKMARK.FOLDERS.key()),"UTF8")).split(YMarkUtil.TAGS_SEPARATOR); + if(bmk_row.containsKey(YMarkEntry.BOOKMARK.FOLDERS.key())) { + final String[] folderArray = (new String(bmk_row.get(YMarkEntry.BOOKMARK.FOLDERS.key()),"UTF8")).split(YMarkUtil.TAGS_SEPARATOR); for (final String folder : folderArray) { if(folder.length() > root.length() && folder.substring(0, root.length()+1).equals(root+'/')) { if(!folders.contains(folder)) { @@ -167,21 +167,21 @@ public class YMarkTables { path.append(folder); //TODO: get rid of .toString.equals() while(path.length() > 0 && !path.toString().equals(root)){ - folders.add(path.toString()); + folders.add(path.toString()); path.setLength(path.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR)); - } + } } - } + } } } } - if (!root.equals(YMarkTables.FOLDERS_ROOT)) { folders.add(root); } + if (!root.equals(YMarkTables.FOLDERS_ROOT)) { folders.add(root); } return folders; } - - public Iterator getBookmarksByFolder(final String bmk_user, final String folder) throws IOException { + + public Iterator getBookmarksByFolder(final String bmk_user, final String folder) throws IOException { final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); - final StringBuffer patternBuilder = new StringBuffer(BUFFER_LENGTH); + final StringBuilder patternBuilder = new StringBuilder(BUFFER_LENGTH); patternBuilder.setLength(0); patternBuilder.append(p1); patternBuilder.append('('); @@ -193,10 +193,10 @@ public class YMarkTables { final Pattern p = Pattern.compile(patternBuilder.toString()); return this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.FOLDERS.key(), p); } - - public Iterator getBookmarksByTag(final String bmk_user, final String[] tagArray) throws IOException { + + public Iterator getBookmarksByTag(final String bmk_user, final String[] tagArray) throws IOException { final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); - final StringBuffer patternBuilder = new StringBuffer(BUFFER_LENGTH); + final StringBuilder patternBuilder = new StringBuilder(BUFFER_LENGTH); patternBuilder.setLength(0); patternBuilder.append(p1); patternBuilder.append(p5); @@ -213,9 +213,9 @@ public class YMarkTables { final Pattern p = Pattern.compile(patternBuilder.toString()); return this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.TAGS.key(), p); } - + public SortedSet orderBookmarksBy(final Iterator rowIterator, final String sortname, final String sortorder) { - TreeSet sortTree = new TreeSet(new TablesRowComparator(sortname)); + final TreeSet sortTree = new TreeSet(new TablesRowComparator(sortname)); Row row; while (rowIterator.hasNext()) { row = rowIterator.next(); @@ -226,36 +226,36 @@ public class YMarkTables { return sortTree.descendingSet(); return sortTree; } - + public void addTags(final String bmk_user, final String url, final String tagString, final boolean merge) throws IOException, RowSpaceExceededException { if(!tagString.isEmpty()) { // do not set defaults as we only want to update tags final YMarkEntry bmk = new YMarkEntry(false); - bmk.put(YMarkEntry.BOOKMARK.URL.key(), url); + bmk.put(YMarkEntry.BOOKMARK.URL.key(), url); bmk.put(YMarkEntry.BOOKMARK.TAGS.key(), YMarkUtil.cleanTagsString(tagString)); - this.addBookmark(bmk_user, bmk, merge, true); - } + addBookmark(bmk_user, bmk, merge, true); + } } - + public void addFolder(final String bmk_user, final String url, final String folder) throws IOException, RowSpaceExceededException { if(!folder.isEmpty()) { // do not set defaults as we only want to add a folder final YMarkEntry bmk = new YMarkEntry(false); - bmk.put(YMarkEntry.BOOKMARK.URL.key(), url); + bmk.put(YMarkEntry.BOOKMARK.URL.key(), url); bmk.put(YMarkEntry.BOOKMARK.FOLDERS.key(), folder); - this.addBookmark(bmk_user, bmk, true, true); - } + addBookmark(bmk_user, bmk, true, true); + } } - + public void visited(final String bmk_user, final String url) throws IOException, RowSpaceExceededException { // do not set defaults final YMarkEntry bmk = new YMarkEntry(false); - bmk.put(YMarkEntry.BOOKMARK.URL.key(), url); + bmk.put(YMarkEntry.BOOKMARK.URL.key(), url); bmk.put(YMarkEntry.BOOKMARK.DATE_VISITED.key(), (new YMarkDate()).toString()); - this.addBookmark(bmk_user, bmk, true, true); + addBookmark(bmk_user, bmk, true, true); } - - + + public void addBookmark(final String bmk_user, final YMarkEntry bmk, final boolean mergeTags, final boolean mergeFolders) throws IOException, RowSpaceExceededException { final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user); final String date = String.valueOf(System.currentTimeMillis()); @@ -267,19 +267,19 @@ public class YMarkTables { if (bmk_row == null) { // create and insert new entry this.worktables.insert(bmk_table, urlHash, bmk.getData()); - } else { + } else { // modify and update existing entry HashSet oldSet; HashSet newSet; - - for (YMarkEntry.BOOKMARK b : YMarkEntry.BOOKMARK.values()) { + + for (final YMarkEntry.BOOKMARK b : YMarkEntry.BOOKMARK.values()) { switch(b) { case DATE_ADDED: if(!bmk_row.containsKey(b.key())) - bmk_row.put(b.key(), date); + bmk_row.put(b.key(), date); break; case DATE_MODIFIED: - bmk_row.put(b.key(), date); + bmk_row.put(b.key(), date); break; case TAGS: oldSet = YMarkUtil.keysStringToSet(bmk_row.get(b.key(),b.deflt())); @@ -295,7 +295,7 @@ public class YMarkTables { } } else { bmk_row.put(b.key(), bmk_row.get(b.key(), b.deflt())); - } + } break; case FOLDERS: oldSet = YMarkUtil.keysStringToSet(bmk_row.get(b.key(),b.deflt())); @@ -312,7 +312,7 @@ public class YMarkTables { } else { bmk_row.put(b.key(), bmk_row.get(b.key(), b.deflt())); } - break; + break; default: if(bmk.containsKey(b.key())) { bmk_row.put(b.key(), bmk.get(b.key())); @@ -322,7 +322,7 @@ public class YMarkTables { } } // update bmk_table - this.worktables.update(bmk_table, bmk_row); + this.worktables.update(bmk_table, bmk_row); } } } diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java index 2d967a531..fa46f3e73 100644 --- a/source/de/anomic/http/server/HTTPDFileHandler.java +++ b/source/de/anomic/http/server/HTTPDFileHandler.java @@ -1377,7 +1377,7 @@ public final class HTTPDFileHandler { final Pattern p = Pattern.compile("(href=\"|src=\")([^\"]+)|(href='|src=')([^']+)|(url\\(')([^']+)|(url\\(\")([^\"]+)|(url\\()([^\\)]+)"); final Matcher m = p.matcher(sbuffer); - final StringBuffer result = new StringBuffer(); + final StringBuffer result = new StringBuffer(80); while (m.find()) { String init = null; if(m.group(1) != null) init = m.group(1); diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index 3d2a142d5..61fca8073 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -9,7 +9,7 @@ // $LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -37,12 +37,13 @@ import java.util.TreeSet; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.Scanner; -import net.yacy.cora.storage.ConcurrentScoreMap; import net.yacy.cora.storage.ClusteredScoreMap; +import net.yacy.cora.storage.ConcurrentScoreMap; import net.yacy.cora.storage.ScoreMap; import net.yacy.cora.storage.WeakPriorityBlockingQueue; import net.yacy.cora.storage.WeakPriorityBlockingQueue.ReverseElement; @@ -55,13 +56,12 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.TermSearch; import net.yacy.kelondro.util.EventTracker; - import de.anomic.yacy.graphics.ProfilingGraph; public final class RankingProcess extends Thread { - + private static final int maxDoubleDomAll = 1000, maxDoubleDomSpecial = 10000; - + private final QueryParams query; private final SortedSet urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) private final int[] flagcount; // flag counter @@ -69,14 +69,14 @@ public final class RankingProcess extends Thread { private int sortout; // counter for referenced that had been sorted out for other reasons //private final int[] domZones; private SortedMap> localSearchInclusion; - + private int remote_resourceSize, remote_indexCount, remote_peerCount; private int local_resourceSize, local_indexCount; private final WeakPriorityBlockingQueue stack; private int feeders; private final ConcurrentHashMap> doubleDomCache; // key = domhash (6 bytes); value = like stack //private final HandleSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process - + private final ScoreMap ref; // reference score computation for the commonSense heuristic private final ScoreMap hostNavigator; // a counter for the appearance of the host hash private final Map hostResolver; // a mapping from a host hash (6 bytes) to the full url hash of one of these urls that have the host hash @@ -85,7 +85,7 @@ public final class RankingProcess extends Thread { private final ReferenceOrder order; private final long startTime; private boolean addRunning; - + public RankingProcess(final QueryParams query, final ReferenceOrder order, final int maxentries) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime @@ -116,36 +116,36 @@ public final class RankingProcess extends Thread { this.feeders = 1; this.startTime = System.currentTimeMillis(); } - + public QueryParams getQuery() { return this.query; } - + public ReferenceOrder getOrder() { return this.order; } - + @Override public void run() { // do a search - + // sort the local containers and truncate it to a limited count, // so following sortings together with the global results will be fast try { - long timer = System.currentTimeMillis(); + final long timer = System.currentTimeMillis(); final TermSearch search = this.query.getSegment().termIndex().query( - query.queryHashes, - query.excludeHashes, + this.query.queryHashes, + this.query.excludeHashes, null, Segment.wordReferenceFactory, - query.maxDistance); + this.query.maxDistance); this.localSearchInclusion = search.inclusion(); final ReferenceContainer index = search.joined(); - EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.JOIN, query.queryString, index.size(), System.currentTimeMillis() - timer), false); + EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.JOIN, this.query.queryString, index.size(), System.currentTimeMillis() - timer), false); if (index.isEmpty()) { return; } - + add(index, true, "local index: " + this.query.getSegment().getLocation(), -1, true); } catch (final Exception e) { Log.logException(e); @@ -153,21 +153,21 @@ public final class RankingProcess extends Thread { oneFeederTerminated(); } } - + public void add( final ReferenceContainer index, final boolean local, - String resourceName, + final String resourceName, final int fullResource, final boolean finalizeAddAtEnd) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime this.addRunning = true; - + assert (index != null); if (index.isEmpty()) return; - + if (local) { this.local_resourceSize += index.size(); } else { @@ -175,16 +175,16 @@ public final class RankingProcess extends Thread { this.remote_resourceSize += fullResource; this.remote_peerCount++; } - + long timer = System.currentTimeMillis(); - + // normalize entries final BlockingQueue decodedEntries = this.order.normalizeWith(index); - EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.NORMALIZING, resourceName, index.size(), System.currentTimeMillis() - timer), false); - + EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.NORMALIZING, resourceName, index.size(), System.currentTimeMillis() - timer), false); + // iterate over normalized entries and select some that are better than currently stored timer = System.currentTimeMillis(); - boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0; + final boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0; // apply all constraints try { @@ -197,7 +197,7 @@ public final class RankingProcess extends Thread { // increase flag counts for (int j = 0; j < 32; j++) { - if (iEntry.flags().get(j)) {flagcount[j]++;} + if (iEntry.flags().get(j)) {this.flagcount[j]++;} } // check constraints @@ -206,11 +206,11 @@ public final class RankingProcess extends Thread { } // check document domain - if (query.contentdom != ContentDomain.TEXT) { - if ((query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { continue; } - if ((query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { continue; } - if ((query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { continue; } - if ((query.contentdom == ContentDomain.APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) { continue; } + if (this.query.contentdom != ContentDomain.TEXT) { + if ((this.query.contentdom == ContentDomain.AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) { continue; } + if ((this.query.contentdom == ContentDomain.VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) { continue; } + if ((this.query.contentdom == ContentDomain.IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) { continue; } + if ((this.query.contentdom == ContentDomain.APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) { continue; } } // check tld domain @@ -221,32 +221,32 @@ public final class RankingProcess extends Thread { continue; } */ - + // count domZones //this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++; - + // check site constraints - String hosthash = iEntry.hosthash(); - if (query.sitehash == null) { + final String hosthash = iEntry.hosthash(); + if (this.query.sitehash == null) { // no site constraint there; maybe collect host navigation information - if (nav_hosts && query.urlMask_isCatchall) { + if (nav_hosts && this.query.urlMask_isCatchall) { this.hostNavigator.inc(hosthash); this.hostResolver.put(hosthash, iEntry.urlhash()); } } else { - if (!hosthash.equals(query.sitehash)) { + if (!hosthash.equals(this.query.sitehash)) { // filter out all domains that do not match with the site constraint continue; } } - + // finally make a double-check and insert result to stack - if (urlhashes.add(iEntry.urlhash())) { + if (this.urlhashes.add(iEntry.urlhash())) { rankingtryloop: while (true) { try { - stack.put(new ReverseElement(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest) + this.stack.put(new ReverseElement(iEntry, this.order.cardinal(iEntry))); // inserts the element and removes the worst (which is smallest) break rankingtryloop; - } catch (ArithmeticException e) { + } catch (final ArithmeticException e) { // this may happen if the concurrent normalizer changes values during cardinal computation continue rankingtryloop; } @@ -256,14 +256,14 @@ public final class RankingProcess extends Thread { } } - } catch (InterruptedException e) {} finally { + } catch (final InterruptedException e) {} finally { if (finalizeAddAtEnd) this.addRunning = false; } - + //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true); - EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.PRESORT, resourceName, index.size(), System.currentTimeMillis() - timer), false); + EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.PRESORT, resourceName, index.size(), System.currentTimeMillis() - timer), false); } - + /** * method to signal the incoming stack that one feeder has terminated */ @@ -271,56 +271,56 @@ public final class RankingProcess extends Thread { this.feeders--; assert this.feeders >= 0 : "feeders = " + this.feeders; } - + protected void moreFeeders(final int countMoreFeeders) { this.feeders += countMoreFeeders; } - + public boolean feedingIsFinished() { return System.currentTimeMillis() - this.startTime > 50 && this.feeders == 0; } - + private boolean testFlags(final WordReference ientry) { - if (query.constraint == null) return true; + if (this.query.constraint == null) return true; // test if ientry matches with filter // if all = true: let only entries pass that has all matching bits // if all = false: let all entries pass that has at least one matching bit - if (query.allofconstraint) { + if (this.query.allofconstraint) { for (int i = 0; i < 32; i++) { - if ((query.constraint.get(i)) && (!ientry.flags().get(i))) return false; + if ((this.query.constraint.get(i)) && (!ientry.flags().get(i))) return false; } return true; } for (int i = 0; i < 32; i++) { - if ((query.constraint.get(i)) && (ientry.flags().get(i))) return true; + if ((this.query.constraint.get(i)) && (ientry.flags().get(i))) return true; } return false; } - + protected Map> searchContainerMap() { // direct access to the result maps is needed for abstract generation // this is only available if execQuery() was called before - return localSearchInclusion; + return this.localSearchInclusion; } - + private WeakPriorityBlockingQueue.Element takeRWI(final boolean skipDoubleDom, final long waitingtime) { - + // returns from the current RWI list the best entry and removes this entry from the list WeakPriorityBlockingQueue m; WeakPriorityBlockingQueue.Element rwi = null; - + // take one entry from the stack if there are entries on that stack or the feeding is not yet finished try { //System.out.println("stack.poll: feeders = " + this.feeders + ", stack.sizeQueue = " + stack.sizeQueue()); int loops = 0; // a loop counter to terminate the reading if all the results are from the same domain - long timeout = System.currentTimeMillis() + waitingtime; - while (((!feedingIsFinished() && this.addRunning) || stack.sizeQueue() > 0) && + final long timeout = System.currentTimeMillis() + waitingtime; + while (((!feedingIsFinished() && this.addRunning) || this.stack.sizeQueue() > 0) && (this.query.itemsPerPage < 1 || loops++ < this.query.itemsPerPage)) { if (waitingtime <= 0) { - rwi = stack.poll(); + rwi = this.stack.poll(); } else timeoutloop:while (System.currentTimeMillis() < timeout) { - if (feedingIsFinished() && stack.sizeQueue() == 0) break timeoutloop; - rwi = stack.poll(50); + if (feedingIsFinished() && this.stack.sizeQueue() == 0) break timeoutloop; + rwi = this.stack.poll(50); if (rwi != null) break timeoutloop; } if (rwi == null) break; @@ -328,14 +328,14 @@ public final class RankingProcess extends Thread { //System.out.println("!skipDoubleDom"); return rwi; } - + // check doubledom final String hosthash = rwi.getElement().hosthash(); synchronized (this.doubleDomCache) { m = this.doubleDomCache.get(hosthash); if (m == null) { // first appearance of dom. we create an entry to signal that one of that domain was already returned - m = new WeakPriorityBlockingQueue((query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll); + m = new WeakPriorityBlockingQueue((this.query.specialRights) ? maxDoubleDomSpecial : maxDoubleDomAll); this.doubleDomCache.put(hosthash, m); return rwi; } @@ -343,9 +343,9 @@ public final class RankingProcess extends Thread { m.put(rwi); } } - } catch (InterruptedException e1) {} + } catch (final InterruptedException e1) {} if (this.doubleDomCache.isEmpty()) return null; - + // no more entries in sorted RWI entries. Now take Elements from the doubleDomCache // find best entry from all caches WeakPriorityBlockingQueue.Element bestEntry = null; @@ -355,7 +355,7 @@ public final class RankingProcess extends Thread { while (i.hasNext()) { try { m = i.next(); - } catch (ConcurrentModificationException e) { + } catch (final ConcurrentModificationException e) { Log.logException(e); continue; // not the best solution... } @@ -372,14 +372,14 @@ public final class RankingProcess extends Thread { } } if (bestEntry == null) return null; - + // finally remove the best entry from the doubledom cache m = this.doubleDomCache.get(bestEntry.getElement().hosthash()); bestEntry = m.poll(); } return bestEntry; } - + /** * get one metadata entry from the ranked results. This will be the 'best' entry so far * according to the applied ranking. If there are no more entries left or the timeout @@ -400,26 +400,26 @@ public final class RankingProcess extends Thread { if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi); if (page == null) { - misses.add(obrwi.getElement().urlhash()); + this.misses.add(obrwi.getElement().urlhash()); continue; } - + // prepare values for constraint check final URIMetadataRow.Components metadata = page.metadata(); - + // check errors if (metadata == null) { this.sortout++; continue; // rare case where the url is corrupted } - - if (!query.urlMask_isCatchall) { + + if (!this.query.urlMask_isCatchall) { // check url mask - if (!metadata.matches(query.urlMask)) { + if (!metadata.matches(this.query.urlMask)) { this.sortout++; continue; } - + // in case that we do not have e catchall filter for urls // we must also construct the domain navigator here //if (query.sitehash == null) { @@ -427,7 +427,7 @@ public final class RankingProcess extends Thread { // this.hostResolver.put(UTF8.String(urlhash, 6, 6), UTF8.String(urlhash)); //} } - + // check for more errors if (metadata.url() == null) { this.sortout++; @@ -439,18 +439,18 @@ public final class RankingProcess extends Thread { final String pagetitle = metadata.dc_title().toLowerCase(); // check exclusion - if ((QueryParams.anymatch(pagetitle, query.excludeHashes)) || - (QueryParams.anymatch(pageurl.toLowerCase(), query.excludeHashes)) || - (QueryParams.anymatch(pageauthor.toLowerCase(), query.excludeHashes))) { + if ((QueryParams.anymatch(pagetitle, this.query.excludeHashes)) || + (QueryParams.anymatch(pageurl.toLowerCase(), this.query.excludeHashes)) || + (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.excludeHashes))) { this.sortout++; continue; } // check index-of constraint - if ((query.constraint != null) && - (query.constraint.get(Condenser.flag_cat_indexof)) && + if ((this.query.constraint != null) && + (this.query.constraint.get(Condenser.flag_cat_indexof)) && (!(pagetitle.startsWith("index of")))) { - final Iterator wi = query.queryHashes.iterator(); + final Iterator wi = this.query.queryHashes.iterator(); while (wi.hasNext()) { this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash()); } @@ -459,41 +459,41 @@ public final class RankingProcess extends Thread { } // check location constraint - if ((query.constraint != null) && - (query.constraint.get(Condenser.flag_cat_haslocation)) && + if ((this.query.constraint != null) && + (this.query.constraint.get(Condenser.flag_cat_haslocation)) && (metadata.lat() == 0.0f || metadata.lon() == 0.0f)) { this.sortout++; continue; } - + // check content domain - if ((query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) || - (query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) || - (query.contentdom == ContentDomain.IMAGE && page.limage() == 0) || - (query.contentdom == ContentDomain.APP && page.lapp() == 0)) { + if ((this.query.contentdom == ContentDomain.AUDIO && page.laudio() == 0) || + (this.query.contentdom == ContentDomain.VIDEO && page.lvideo() == 0) || + (this.query.contentdom == ContentDomain.IMAGE && page.limage() == 0) || + (this.query.contentdom == ContentDomain.APP && page.lapp() == 0)) { this.sortout++; continue; } - + // evaluate information of metadata for navigation // author navigation: if (pageauthor != null && pageauthor.length() > 0) { // add author to the author navigator - String authorhash = ASCII.String(Word.word2hash(pageauthor)); + final String authorhash = ASCII.String(Word.word2hash(pageauthor)); // check if we already are filtering for authors if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) { this.sortout++; continue; } - + // add author to the author navigator this.authorNavigator.inc(pageauthor); } else if (this.query.authorhash != null) { this.sortout++; continue; } - + // namespace navigation String pagepath = metadata.url().getPath(); if ((p = pagepath.indexOf(':')) >= 0) { @@ -504,49 +504,49 @@ public final class RankingProcess extends Thread { this.namespaceNavigator.inc(pagepath); } } - + // check Scanner if (!Scanner.acceptURL(metadata.url())) { this.sortout++; continue; } - + // accept url return page; } return null; } - + public int sizeQueue() { - int c = stack.sizeQueue(); - for (WeakPriorityBlockingQueue s: this.doubleDomCache.values()) { + int c = this.stack.sizeQueue(); + for (final WeakPriorityBlockingQueue s: this.doubleDomCache.values()) { c += s.sizeQueue(); } return c; } - + public int sizeAvailable() { - int c = stack.sizeAvailable(); - for (WeakPriorityBlockingQueue s: this.doubleDomCache.values()) { + int c = this.stack.sizeAvailable(); + for (final WeakPriorityBlockingQueue s: this.doubleDomCache.values()) { c += s.sizeAvailable(); } return c; } - + public boolean isEmpty() { - if (!stack.isEmpty()) return false; - for (WeakPriorityBlockingQueue s: this.doubleDomCache.values()) { + if (!this.stack.isEmpty()) return false; + for (final WeakPriorityBlockingQueue s: this.doubleDomCache.values()) { if (!s.isEmpty()) return false; } return true; } - + public int[] flagCount() { - return flagcount; + return this.flagcount; } - + // "results from a total number of known ( local, remote), links from other YaCy peers." - + public int filteredCount() { // the number of index entries that are considered as result set return this.stack.sizeAvailable(); @@ -556,44 +556,44 @@ public final class RankingProcess extends Thread { // the number of results in the local peer after filtering return this.local_indexCount; } - + public int getRemoteIndexCount() { // the number of result contributions from all the remote peers return this.remote_indexCount; } - + public int getRemoteResourceSize() { // the number of all hits in all the remote peers return Math.max(this.remote_resourceSize, this.remote_indexCount); } - + public int getRemotePeerCount() { // the number of remote peers that have contributed return this.remote_peerCount; } - + public Iterator miss() { return this.misses.iterator(); } - + public int getMissCount() { return this.misses.size(); } - + public int getSortOutCount() { return this.sortout; } - + public ScoreMap getNamespaceNavigator() { if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace") < 0) return new ClusteredScoreMap(); if (this.namespaceNavigator.sizeSmaller(2)) this.namespaceNavigator.clear(); // navigators with one entry are not useful return this.namespaceNavigator; } - + public ScoreMap getHostNavigator() { - ScoreMap result = new ConcurrentScoreMap(); + final ScoreMap result = new ConcurrentScoreMap(); if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return result; - + final Iterator domhashs = this.hostNavigator.keys(false); URIMetadataRow row; byte[] urlhash; @@ -613,14 +613,14 @@ public final class RankingProcess extends Thread { } public static final Comparator> mecomp = new Comparator>() { - public int compare(Map.Entry o1, Map.Entry o2) { + public int compare(final Map.Entry o1, final Map.Entry o2) { if (o1.getValue().intValue() < o2.getValue().intValue()) return 1; if (o2.getValue().intValue() < o1.getValue().intValue()) return -1; return 0; } }; - public ScoreMap getTopicNavigator(int count) { + public ScoreMap getTopicNavigator(final int count) { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls final ScoreMap result = new ConcurrentScoreMap(); @@ -645,38 +645,40 @@ public final class RankingProcess extends Thread { counts.put(word, q); } } - if (max > min) for (Map.Entry ce: counts.entrySet()) { + if (max > min) for (final Map.Entry ce: counts.entrySet()) { result.set(ce.getKey(), (int) (((double) count) * (ce.getValue() - min) / (max - min))); } return this.ref; } - + + private final static Pattern lettermatch = Pattern.compile("[a-z]+"); + public void addTopic(final String[] words) { String word; for (final String w : words) { word = w.toLowerCase(); if (word.length() > 2 && "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off".indexOf(word) < 0 && - !query.queryHashes.has(Word.word2hash(word)) && - word.matches("[a-z]+") && + !this.query.queryHashes.has(Word.word2hash(word)) && + lettermatch.matcher(word).matches() && !Switchboard.badwords.contains(word) && !Switchboard.stopwords.contains(word)) { - ref.inc(word); + this.ref.inc(word); } } } - + protected void addTopics(final ResultEntry resultEntry) { // take out relevant information for reference computation if ((resultEntry.url() == null) || (resultEntry.title() == null)) return; //final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url final String[] descrcomps = MultiProtocolURI.splitpattern.split(resultEntry.title().toLowerCase()); // words in the description - + // add references //addTopic(urlcomps); addTopic(descrcomps); } - + public ScoreMap getAuthorNavigator() { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls diff --git a/source/de/anomic/yacy/dht/Dispatcher.java b/source/de/anomic/yacy/dht/Dispatcher.java index 434b1af4f..776f6f956 100755 --- a/source/de/anomic/yacy/dht/Dispatcher.java +++ b/source/de/anomic/yacy/dht/Dispatcher.java @@ -1,4 +1,4 @@ -// Dispatcher.java +// Dispatcher.java // ------------------------------ // part of YaCy // (C) 2009 by Michael Peter Christen; mc@yacy.net @@ -28,6 +28,7 @@ package de.anomic.yacy.dht; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; +import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; @@ -41,11 +42,9 @@ import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.util.ByteArray; import net.yacy.kelondro.workflow.WorkflowProcessor; - import de.anomic.search.Segment; import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacySeedDB; -import java.util.List; public class Dispatcher { @@ -80,30 +79,30 @@ public class Dispatcher { * or the target queue runs out of entries. If the target queue is empty, the transmission is * called failed. In case of a fail, the RWI fragment is put back into the backend index structure */ - + // a cloud is a cache for the objects that wait to be transmitted // the String-key is the primary target as contained in the Entry private Map transmissionCloud; - + // the segment backend is used to store the remaining indexContainers in case that the object is closed private final Segment segment; - + // the seed database private final yacySeedDB seeds; - + // the log private final Log log; - + // transmission process private WorkflowProcessor indexingTransmissionProcessor; - + // transmission object private final Transmission transmission; - + public Dispatcher( final Segment segment, final yacySeedDB seeds, - final boolean gzipBody, + final boolean gzipBody, final int timeout ) { this.transmissionCloud = new ConcurrentHashMap(); @@ -111,28 +110,28 @@ public class Dispatcher { this.seeds = seeds; this.log = new Log("INDEX-TRANSFER-DISPATCHER"); this.transmission = new Transmission( - log, + this.log, segment, seeds, gzipBody, timeout); - - int concurrentSender = Math.min(32, Math.max(10, WorkflowProcessor.availableCPU)); - indexingTransmissionProcessor = new WorkflowProcessor( + + final int concurrentSender = Math.min(32, Math.max(10, WorkflowProcessor.availableCPU)); + this.indexingTransmissionProcessor = new WorkflowProcessor( "transferDocumentIndex", "This is the RWI transmission process", new String[]{"RWI/Cache/Collections"}, this, "transferDocumentIndex", concurrentSender * 2, null, concurrentSender); } - + public int cloudSize() { return (this.transmissionCloud == null) ? 0 : this.transmissionCloud.size(); } - + public int transmissionSize() { return (this.indexingTransmissionProcessor == null) ? 0 : this.indexingTransmissionProcessor.queueSize(); } - + /** * PROCESS(1) * select a number of index containers from the backend index. @@ -150,15 +149,15 @@ public class Dispatcher { final int maxContainerCount, final int maxReferenceCount, final int maxtime) throws IOException { - + // prefer file - ArrayList> containers = selectContainers(hash, limitHash, maxContainerCount, maxReferenceCount, maxtime, false); + final ArrayList> containers = selectContainers(hash, limitHash, maxContainerCount, maxReferenceCount, maxtime, false); // if ram does not provide any result, take from file //if (containers.isEmpty()) containers = selectContainers(hash, limitHash, maxContainerCount, maxtime, false); return containers; } - + private ArrayList> selectContainers( final byte[] hash, final byte[] limitHash, @@ -166,9 +165,9 @@ public class Dispatcher { final int maxReferenceCount, final int maxtime, final boolean ram) throws IOException { - + final ArrayList> containers = new ArrayList>(maxContainerCount); - + final Iterator> indexContainerIterator = this.segment.termIndex().references(hash, true, ram); ReferenceContainer container; int refcount = 0; @@ -183,7 +182,7 @@ public class Dispatcher { ((container = indexContainerIterator.next()) != null) && ((containers.isEmpty()) || (Base64Order.enhancedCoder.compare(container.getTermHash(), limitHash) < 0)) - + ) { if (container.isEmpty()) continue; refcount += container.size(); @@ -193,12 +192,12 @@ public class Dispatcher { final ArrayList> rc; if (ram) { // selection was only from ram, so we have to carefully remove only the selected entries - HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); + final HandleSet urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); Iterator it; - for (ReferenceContainer c: containers) { + for (final ReferenceContainer c: containers) { urlHashes.clear(); it = c.entries(); - while (it.hasNext()) try { urlHashes.put(it.next().urlhash()); } catch (RowSpaceExceededException e) { Log.logException(e); } + while (it.hasNext()) try { urlHashes.put(it.next().urlhash()); } catch (final RowSpaceExceededException e) { Log.logException(e); } if (this.log.isFine()) this.log.logFine("selected " + urlHashes.size() + " urls for word '" + ASCII.String(c.getTermHash()) + "'"); if (!urlHashes.isEmpty()) this.segment.termIndex().remove(c.getTermHash(), urlHashes); } @@ -207,7 +206,7 @@ public class Dispatcher { // selection was from whole index, so we can just delete the whole container // but to avoid race conditions return the results from the deletes rc = new ArrayList>(containers.size()); - for (ReferenceContainer c: containers) { + for (final ReferenceContainer c: containers) { container = this.segment.termIndex().delete(c.getTermHash()); // be aware this might be null! if (container != null && !container.isEmpty()) { if (this.log.isFine()) this.log.logFine("selected " + container.size() + " urls for word '" + ASCII.String(c.getTermHash()) + "'"); @@ -215,7 +214,7 @@ public class Dispatcher { } } } - + // finished. The caller must take care of the containers and must put them back if not needed return rc; } @@ -226,33 +225,33 @@ public class Dispatcher { * @param containers * @param scheme * @return - * @throws RowSpaceExceededException + * @throws RowSpaceExceededException */ @SuppressWarnings("unchecked") - private List>[] splitContainers(List> containers) throws RowSpaceExceededException { - + private List>[] splitContainers(final List> containers) throws RowSpaceExceededException { + // init the result vector - int partitionCount = this.seeds.scheme.verticalPartitions(); - List>[] partitions = (ArrayList>[]) new ArrayList[partitionCount]; + final int partitionCount = this.seeds.scheme.verticalPartitions(); + final List>[] partitions = new ArrayList[partitionCount]; for (int i = 0; i < partitions.length; i++) partitions[i] = new ArrayList>(); - + // check all entries and split them to the partitions - ReferenceContainer[] partitionBuffer = new ReferenceContainer[partitionCount]; + final ReferenceContainer[] partitionBuffer = new ReferenceContainer[partitionCount]; WordReference re; - for (ReferenceContainer container: containers) { + for (final ReferenceContainer container: containers) { // init the new partitions for (int j = 0; j < partitionBuffer.length; j++) { partitionBuffer[j] = new ReferenceContainer(Segment.wordReferenceFactory, container.getTermHash(), container.size() / partitionCount); } // split the container - Iterator i = container.entries(); + final Iterator i = container.entries(); while (i.hasNext()) { re = i.next(); if (re == null) continue; partitionBuffer[this.seeds.scheme.verticalPosition(re.urlhash())].add(re); } - + // add the containers to the result vector for (int j = 0; j < partitionBuffer.length; j++) { partitions[j].add(partitionBuffer[j]); @@ -260,7 +259,7 @@ public class Dispatcher { } return partitions; } - + /** * PROCESS(3) and PROCESS(4) * put containers into cloud. This needs information about the network, @@ -272,7 +271,7 @@ public class Dispatcher { * then no additional IO is necessary. */ private void enqueueContainersToCloud(final List>[] containers) { - if (transmissionCloud == null) return; + if (this.transmissionCloud == null) return; ReferenceContainer lastContainer; byte[] primaryTarget; ByteArray pTArray; @@ -283,16 +282,16 @@ public class Dispatcher { primaryTarget = FlatWordPartitionScheme.positionToHash(this.seeds.scheme.dhtPosition(lastContainer.getTermHash(), vertical)); assert primaryTarget[2] != '@'; pTArray = new ByteArray(primaryTarget); - + // get or make a entry object entry = this.transmissionCloud.get(pTArray); // if this is not null, the entry is extended here - List targets = PeerSelection.getAcceptRemoteIndexSeedsList( - seeds, + final List targets = PeerSelection.getAcceptRemoteIndexSeedsList( + this.seeds, primaryTarget, - seeds.redundancy() * 3, + this.seeds.redundancy() * 3, true); this.log.logInfo("enqueueContainers: selected " + targets.size() + " targets for primary target key " + ASCII.String(primaryTarget) + "/" + vertical + " with " + containers[vertical].size() + " index containers."); - if (entry == null) entry = transmission.newChunk(primaryTarget, targets); + if (entry == null) entry = this.transmission.newChunk(primaryTarget, targets); /*/ lookup targets int sc = 1; @@ -303,17 +302,17 @@ public class Dispatcher { this.log.logInfo("enqueueContainers: distance to first container at position " + sc + ": " + FlatWordPartitionScheme.std.dhtDistance(FlatWordPartitionScheme.positionToHash(this.seeds.scheme.dhtPosition(containers[vertical].get(0).getTermHash(), vertical)), null, seed)); sc++; }*/ - + // fill the entry with the containers - for (ReferenceContainer c: containers[vertical]) { + for (final ReferenceContainer c: containers[vertical]) { try { entry.add(c); - } catch (RowSpaceExceededException e) { + } catch (final RowSpaceExceededException e) { Log.logException(e); break; } } - + // put the entry into the cloud if (entry.containersSize() > 0) this.transmissionCloud.put(pTArray, entry); } @@ -330,12 +329,12 @@ public class Dispatcher { List> selectedContainerCache; try { selectedContainerCache = selectContainers(hash, limitHash, maxContainerCount, maxReferenceCount, maxtime); - } catch (IOException e) { + } catch (final IOException e) { this.log.logSevere("selectContainersEnqueueToCloud: selectedContainer failed", e); return false; } this.log.logInfo("selectContainersEnqueueToCloud: selectedContainerCache was filled with " + selectedContainerCache.size() + " entries"); - + if (selectedContainerCache == null || selectedContainerCache.isEmpty()) { this.log.logInfo("selectContainersEnqueueToCloud: selectedContainerCache is empty, cannot do anything here."); return false; @@ -344,7 +343,7 @@ public class Dispatcher { List>[] splitContainerCache; try { splitContainerCache = splitContainers(selectedContainerCache); - } catch (RowSpaceExceededException e) { + } catch (final RowSpaceExceededException e) { this.log.logSevere("selectContainersEnqueueToCloud: splitContainers failed because of too low RAM", e); return false; } @@ -363,7 +362,7 @@ public class Dispatcher { this.log.logInfo("selectContainersEnqueueToCloud: splitContainerCache enqueued to cloud array which has now " + this.transmissionCloud.size() + " entries."); return true; } - + /** * PROCESS(5) * take the largest container from the cloud and put it into the 'next' array, @@ -371,43 +370,50 @@ public class Dispatcher { * This method returns true if a container was dequeued, false if not */ public boolean dequeueContainer() { - if (transmissionCloud == null) return false; - if (this.indexingTransmissionProcessor.queueSize() > indexingTransmissionProcessor.concurrency()) return false; + if (this.transmissionCloud == null) return false; + if (this.indexingTransmissionProcessor.queueSize() > this.indexingTransmissionProcessor.concurrency()) return false; ByteArray maxtarget = null; int maxsize = -1; - for (Map.Entry chunk: this.transmissionCloud.entrySet()) { + for (final Map.Entry chunk: this.transmissionCloud.entrySet()) { if (chunk.getValue().containersSize() > maxsize) { maxsize = chunk.getValue().containersSize(); maxtarget = chunk.getKey(); } } if (maxsize < 0) return false; - Transmission.Chunk chunk = this.transmissionCloud.remove(maxtarget); + final Transmission.Chunk chunk = this.transmissionCloud.remove(maxtarget); try { this.indexingTransmissionProcessor.enQueue(chunk); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Log.logException(e); } return true; } - - public Transmission.Chunk transferDocumentIndex(Transmission.Chunk chunk) { + + /** + * transfer job: this method is called using reflection from the switchboard + * the method is called as a Workflow process. That means it is always called whenever + * a job is placed in the workflow queue. This happens in dequeueContainer() + * @param chunk + * @return + */ + public Transmission.Chunk transferDocumentIndex(final Transmission.Chunk chunk) { // do the transmission - boolean success = chunk.transmit(); - + final boolean success = chunk.transmit(); + if (success && chunk.isFinished()) { // finished with this queue! this.log.logInfo("STORE: Chunk " + ASCII.String(chunk.primaryTarget()) + " has FINISHED all transmissions!"); return chunk; } - + if (!success) this.log.logInfo("STORE: Chunk " + ASCII.String(chunk.primaryTarget()) + " has failed to transmit index; marked peer as busy"); - + if (chunk.canFinish()) { try { if (this.indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.enQueue(chunk); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Log.logException(e); return null; } @@ -420,12 +426,12 @@ public class Dispatcher { public void close() { // removes all entries from the dispatcher and puts them back to a RAMRI - if (indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.announceShutdown(); + if (this.indexingTransmissionProcessor != null) this.indexingTransmissionProcessor.announceShutdown(); if (this.transmissionCloud != null) { - outerLoop: for (Map.Entry e : this.transmissionCloud.entrySet()) { - for (ReferenceContainer i : e.getValue()) try { + outerLoop: for (final Map.Entry e : this.transmissionCloud.entrySet()) { + for (final ReferenceContainer i : e.getValue()) try { this.segment.termIndex().add(i); - } catch (Exception e1) { + } catch (final Exception e1) { Log.logException(e1); break outerLoop; } @@ -433,11 +439,11 @@ public class Dispatcher { this.transmissionCloud.clear(); } this.transmissionCloud = null; - if (indexingTransmissionProcessor != null) { + if (this.indexingTransmissionProcessor != null) { this.indexingTransmissionProcessor.awaitShutdown(10000); this.indexingTransmissionProcessor.clear(); } this.indexingTransmissionProcessor = null; } - + } diff --git a/source/net/yacy/kelondro/logging/SimpleLogFormatter.java b/source/net/yacy/kelondro/logging/SimpleLogFormatter.java index 240114770..3fa4ac75d 100644 --- a/source/net/yacy/kelondro/logging/SimpleLogFormatter.java +++ b/source/net/yacy/kelondro/logging/SimpleLogFormatter.java @@ -1,4 +1,4 @@ -//SimpleLogFormatter.java +//SimpleLogFormatter.java //------------------------------------- //part of YACY //(C) by Michael Peter Christen; mc@yacy.net @@ -50,13 +50,13 @@ public final class SimpleLogFormatter extends SimpleFormatter { public SimpleLogFormatter() { super(); } - + @Override public final synchronized String format(final LogRecord record) { - + final StringBuffer stringBuffer = this.buffer; stringBuffer.setLength(0); - + // adding the loglevel final int logLevel = record.getLevel().intValue(); if (logLevel == Log.LOGLEVEL_SEVERE) @@ -67,16 +67,16 @@ public final class SimpleLogFormatter extends SimpleFormatter { this.buffer.append(Log.LOGTOKEN_CONFIG); else if (logLevel == Log.LOGLEVEL_INFO) this.buffer.append(Log.LOGTOKEN_INFO); - else if (logLevel == Log.LOGLEVEL_FINE) + else if (logLevel == Log.LOGLEVEL_FINE) this.buffer.append(Log.LOGTOKEN_FINE); - else if (logLevel == Log.LOGLEVEL_FINER) - this.buffer.append(Log.LOGTOKEN_FINER); - else if (logLevel == Log.LOGLEVEL_FINEST) - this.buffer.append(Log.LOGTOKEN_FINEST); - else + else if (logLevel == Log.LOGLEVEL_FINER) + this.buffer.append(Log.LOGTOKEN_FINER); + else if (logLevel == Log.LOGLEVEL_FINEST) + this.buffer.append(Log.LOGTOKEN_FINEST); + else this.buffer.append(Log.LOGTOKEN_FINE); this.buffer.append(' '); - + // adding the logging date this.date.setTime(record.getMillis()); this.position.setBeginIndex(0); @@ -85,11 +85,11 @@ public final class SimpleLogFormatter extends SimpleFormatter { // adding the logger name stringBuffer.append(' '); stringBuffer.append(record.getLoggerName()); - + // adding the logging message stringBuffer.append(' '); stringBuffer.append(formatMessage(record)); - + // adding the stack trace if available stringBuffer.append(System.getProperty("line.separator")); if (record.getThrown() != null) { diff --git a/source/net/yacy/kelondro/table/Table.java b/source/net/yacy/kelondro/table/Table.java index c0b9cc0e9..abc7474ce 100644 --- a/source/net/yacy/kelondro/table/Table.java +++ b/source/net/yacy/kelondro/table/Table.java @@ -464,17 +464,17 @@ public class Table implements Index, Iterable { final int i = (int) this.index.get(key); if (i == -1) return null; final byte[] b = new byte[this.rowdef.objectsize]; - if (this.table == null) { + final Row.Entry cacherow; + if (this.table == null || (cacherow = this.table.get(i, false)) == null) { // read row from the file this.file.get(i, b, 0); } else { // construct the row using the copy in RAM - final Row.Entry v = this.table.get(i, false); - assert v != null; - if (v == null) return null; + assert cacherow != null; + if (cacherow == null) return null; assert key.length == this.rowdef.primaryKeyLength; System.arraycopy(key, 0, b, 0, key.length); - System.arraycopy(v.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength); + System.arraycopy(cacherow.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength); } return this.rowdef.newEntry(b); } @@ -503,7 +503,7 @@ public class Table implements Index, Iterable { assert this.table == null || this.table.size() == this.index.size() : "table.size() = " + this.table.size() + ", index.size() = " + this.index.size(); assert row != null; assert row.bytes() != null; - if ((row == null) || (row.bytes() == null)) return null; + if (row == null || row.bytes() == null) return null; final int i = (int) this.index.get(row.getPrimaryKeyBytes()); if (i == -1) { try { @@ -517,17 +517,17 @@ public class Table implements Index, Iterable { } final byte[] b = new byte[this.rowdef.objectsize]; - if (this.table == null) { + Row.Entry cacherow; + if (this.table == null || (cacherow = this.table.get(i, false)) == null) { // read old value this.file.get(i, b, 0); // write new value this.file.put(i, row.bytes(), 0); } else { // read old value - final Row.Entry v = this.table.get(i, false); - assert v != null; + assert cacherow != null; System.arraycopy(row.getPrimaryKeyBytes(), 0, b, 0, this.rowdef.primaryKeyLength); - System.arraycopy(v.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength); + System.arraycopy(cacherow.bytes(), 0, b, this.rowdef.primaryKeyLength, this.rowdef.objectsize - this.rowdef.primaryKeyLength); // write new value try { this.table.set(i, this.taildef.newEntry(row.bytes(), this.rowdef.primaryKeyLength, true)); @@ -573,13 +573,12 @@ public class Table implements Index, Iterable { this.file.put(i, row.bytes(), 0); } else { // write new value - try { + this.file.put(i, row.bytes(), 0); + if (abandonTable()) this.table = null; else try { this.table.set(i, this.taildef.newEntry(row.bytes(), this.rowdef.primaryKeyLength, true)); } catch (final RowSpaceExceededException e) { this.table = null; } - if (abandonTable()) this.table = null; - this.file.put(i, row.bytes(), 0); } assert this.file.size() == this.index.size() : "file.size() = " + this.file.size() + ", index.size() = " + this.index.size(); assert this.table == null || this.table.size() == this.index.size() : "table.size() = " + this.table.size() + ", index.size() = " + this.index.size(); @@ -665,7 +664,8 @@ public class Table implements Index, Iterable { final int sb = this.index.size(); int ix; assert i < this.index.size(); - if (this.table == null) { + final Row.Entry cacherow; + if (this.table == null || (cacherow = this.table.get(i, false)) == null) { if (i == this.index.size() - 1) { // element is at last entry position ix = (int) this.index.remove(key); @@ -697,9 +697,8 @@ public class Table implements Index, Iterable { assert this.file.size() == this.index.size() : "file.size() = " + this.file.size() + ", index.size() = " + this.index.size(); } else { // get result value from the table copy, so we don't need to read it from the file - final Row.Entry v = this.table.get(i, false); System.arraycopy(key, 0, b, 0, key.length); - System.arraycopy(v.bytes(), 0, b, this.rowdef.primaryKeyLength, this.taildef.objectsize); + System.arraycopy(cacherow.bytes(), 0, b, this.rowdef.primaryKeyLength, this.taildef.objectsize); if (i == this.index.size() - 1) { // special handling if the entry is the last entry in the file @@ -911,7 +910,8 @@ public class Table implements Index, Iterable { this.c = (int) Table.this.index.get(k); if (this.c < 0) throw new ConcurrentModificationException(); // this should only happen if the table was modified during the iteration final byte[] b = new byte[Table.this.rowdef.objectsize]; - if (Table.this.table == null) { + final Row.Entry cacherow; + if (Table.this.table == null || (cacherow = Table.this.table.get(this.c, false)) == null) { // read from file try { Table.this.file.get(this.c, b, 0); @@ -921,11 +921,10 @@ public class Table implements Index, Iterable { } } else { // compose from table and key - final Row.Entry v = Table.this.table.get(this.c, false); - assert v != null; - if (v == null) return null; + assert cacherow != null; + if (cacherow == null) return null; System.arraycopy(k, 0, b, 0, Table.this.rowdef.primaryKeyLength); - System.arraycopy(v.bytes(), 0, b, Table.this.rowdef.primaryKeyLength, Table.this.taildef.objectsize); + System.arraycopy(cacherow.bytes(), 0, b, Table.this.rowdef.primaryKeyLength, Table.this.taildef.objectsize); } return Table.this.rowdef.newEntry(b); }