diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index b8191d340..15c24e20a 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -300,7 +300,7 @@ public class IndexControl_p {
                     "true".equalsIgnoreCase(gzipBody),
                     timeout);
             result = (String) resultObj.get("result");
-            prop.put("result", (result == null) ? ("Successfully transferred " + index.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
+            prop.put("result", (result == null) ? ("Successfully transferred " + knownURLs.size() + " URLs in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries + " URLs not found") : result);
             index = null;
         }
diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java
index f9152a133..9de800a11 100644
--- a/htroot/yacy/transferRWI.java
+++ b/htroot/yacy/transferRWI.java
@@ -203,7 +203,7 @@ public final class transferRWI {
         }
         if (unknownURLs.length() > 0) { unknownURLs.delete(0, 1); }
         if ((wordhashes.length == 0) || (received == 0)) {
-            sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs");
+            sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs, blocked " + blocked + " RWIs");
         } else {
             final double avdist = (yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[0]) + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[received - 1])) / 2.0;
             sb.getLog().logInfo("Received " + received + " Entries " + wordc + " Words [" + wordhashes[0] + " .. " + wordhashes[received - 1] + "]/" + avdist + " from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + "/" + receivedURL + " URLs, blocked " + blocked + " RWIs");
diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java
index bfcdbce05..36d3209f4 100644
--- a/htroot/yacy/transferURL.java
+++ b/htroot/yacy/transferURL.java
@@ -46,10 +46,12 @@
 // javac -classpath .:../classes transferRWI.java
 
 import java.io.IOException;
+import java.text.ParseException;
 
 import de.anomic.http.httpHeader;
 import de.anomic.index.indexURLEntry;
 import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaURL;
 import de.anomic.plasma.urlPattern.plasmaURLPattern;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverObjects;
@@ -59,11 +61,14 @@ import de.anomic.yacy.yacySeed;
 
 public final class transferURL {
 
+
     public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) throws InterruptedException {
         if (post == null || ss == null) { return null; }
 
         long start = System.currentTimeMillis();
-
+        long freshdate = 0;
+        try {freshdate = plasmaURL.shortDayFormatter.parse("20061101").getTime();} catch (ParseException e1) {}
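+        // (note) 2006-11-01 acts as a plausibility cutoff below: transferred URL entries
+        // whose freshdate is not after this fixed date are rejected as invalid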
+        
         // return variable that accumulates replacements
         final plasmaSwitchboard sb = (plasmaSwitchboard) ss;
         final serverObjects prop = new serverObjects();
@@ -93,35 +98,45 @@ public final class transferURL {
             indexURLEntry lEntry;
             for (int i = 0; i < urlc; i++) {
                 serverCore.checkInterruption();
+                
+                // read new lurl-entry
                 urls = (String) post.get("url" + i);
                 if (urls == null) {
                     yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
-                } else {
-                    lEntry = sb.wordIndex.loadedURL.newEntry(urls);
-                    if (lEntry == null) {
-                        yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
-                        // TODO: should we send back an error message???
-                    } else {
-                        indexURLEntry.Components comp = lEntry.comp();
-                        if (comp.url() == null) {
-                            yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
-                            // TODO: should we send back an error message???
-                        } else {
-                            if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
-                                int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
-                                yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
-                                lEntry = null;
-                                blocked++;
-                            } else try {
-                                sb.wordIndex.loadedURL.store(lEntry);
-                                sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
-                                yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
-                                received++;
-                            } catch (IOException e) {
-                                e.printStackTrace();
-                            }
-                        }
-                    }
+                    continue;
+                }
+                
+                // parse new lurl-entry
+                lEntry = sb.wordIndex.loadedURL.newEntry(urls);
+                if (lEntry == null) {
+                    yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
+                    continue;
+                }
+                
+                // check if entry is well-formed
+                indexURLEntry.Components comp = lEntry.comp();
+                if ((comp.url() == null) || (lEntry.freshdate().getTime() <= freshdate)) {
+                    yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + "\n\tURL Property: " + urls);
+                    continue;
+                }
+                
+                // check if the entry is blacklisted
+                if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
+                    int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
+                    yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
+                    lEntry = null;
+                    blocked++;
+                    continue;
+                }
+                
+                // write entry to database
+                try {
+                    sb.wordIndex.loadedURL.store(lEntry);
+                    sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
+                    yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
+                    received++;
+                } catch (IOException e) {
+                    e.printStackTrace();
                 }
             }
diff --git a/source/de/anomic/index/indexRWIEntryNew.java b/source/de/anomic/index/indexRWIEntryNew.java
index d9eae2d2f..5894b242c 100644
--- a/source/de/anomic/index/indexRWIEntryNew.java
+++ b/source/de/anomic/index/indexRWIEntryNew.java
@@ -117,7 +117,6 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
             int      sizeOfPage,   // # of bytes of the page TODO: not needed any more
             long     lastmodified, // last-modified time of the document where word appears
             long     updatetime,   // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
-            int      quality,      // the entropy value
             String   language,     // (guessed) language of document
             char     doctype,      // type of document
             int      outlinksSame, // outlinks to same domain
diff --git a/source/de/anomic/index/indexRWIEntryOld.java b/source/de/anomic/index/indexRWIEntryOld.java
index 25f3a93a8..268f9dec2 100644
--- a/source/de/anomic/index/indexRWIEntryOld.java
+++ b/source/de/anomic/index/indexRWIEntryOld.java
@@ -33,7 +33,6 @@ import de.anomic.kelondro.kelondroColumn;
 import de.anomic.kelondro.kelondroRow;
 import de.anomic.kelondro.kelondroRow.Entry;
 import de.anomic.plasma.plasmaSearchQuery;
-import de.anomic.plasma.plasmaURL;
 import de.anomic.plasma.plasmaWordIndex;
 import de.anomic.yacy.yacySeedDB;
 
@@ -66,7 +65,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
     private static final int col_hitcount     = 3;
     private static final int col_language     = 4;
     private static final int col_doctype      = 5;
-    private static final int col_localflag    = 6;
+    //private static final int col_localflag    = 6;
     private static final int col_posintext    = 7;
     private static final int col_posinphrase  = 8;
     private static final int col_posofphrase  = 9;
@@ -77,6 +76,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
 
     private kelondroRow.Entry entry;
 
+    /*
     public indexRWIEntryOld(String  urlHash,
                     int      urlLength,    // byte-length of complete URL
                     int      urlComps,     // number of path components
@@ -91,7 +91,6 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
                     int      sizeOfPage,   // # of bytes of the page
                     long     lastmodified, //*last-modified time of the document where word appears
                     long     updatetime,   // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
-                    int      quality,      //*the entropy value
                     String   language,     //*(guessed) language of document
                     char     doctype,      //*type of document
                     int      outlinksSame, // outlinks to same domain
@@ -107,7 +106,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
         if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
         this.entry = urlEntryRow.newEntry();
         this.entry.setCol(col_urlhash, urlHash, null);
-        this.entry.setCol(col_quality, quality);
+        this.entry.setCol(col_quality, 0);
         this.entry.setCol(col_lastModified, lastmodified);
         this.entry.setCol(col_hitcount, hitcount);
         this.entry.setCol(col_language, language, null);
@@ -121,7 +120,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
         this.entry.setCol(col_phrasecount, phrasecount);
         //System.out.println("DEBUG-NEWENTRY " + toPropertyForm());
     }
-    
+*/
     public indexRWIEntryOld(String urlHash, String code) {
         // the code is the external form of the row minus the leading urlHash entry
         this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
diff --git a/source/de/anomic/kelondro/kelondroBitfield.java b/source/de/anomic/kelondro/kelondroBitfield.java
index c7560ea8b..ce1042987 100644
--- a/source/de/anomic/kelondro/kelondroBitfield.java
+++ b/source/de/anomic/kelondro/kelondroBitfield.java
@@ -24,7 +24,7 @@
 
 package de.anomic.kelondro;
 
-public class kelondroBitfield {
+public class kelondroBitfield implements Cloneable {
 
     // the bitfield implements a binary array. Such arrays may be exported in a base64-String
 
@@ -55,6 +55,12 @@ public class kelondroBitfield {
         }
     }
     
+    public Object clone() {
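+        // deep copy: the backing byte array is duplicated, so the clone's bits can be
+        // changed without affecting this bitfield (used for per-word flag templates)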
+        kelondroBitfield theClone = new kelondroBitfield(new byte[this.bb.length]);
+        System.arraycopy(this.bb, 0, theClone.bb, 0, this.bb.length);
+        return theClone;
+    }
+    
     public void set(int pos, boolean value) {
         assert (pos >= 0);
         int slot = pos / 8;
diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java
index e573f7532..8644b5c4d 100644
--- a/source/de/anomic/kelondro/kelondroRecords.java
+++ b/source/de/anomic/kelondro/kelondroRecords.java
@@ -1392,7 +1392,7 @@ public class kelondroRecords {
                 USAGE.FREEC--;
                 // take link
                 if (USAGE.FREEH.index == NUL) {
-                    System.out.println("INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records. Affected file: " + filename);
+                    serverLog.logSevere("kelondroRecords/" + filename, "INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records.");
                     // try to heal..
                     USAGE.USEDC = USAGE.allCount() + 1;
                     USAGE.FREEC = 0;
@@ -1402,10 +1402,17 @@ public class kelondroRecords {
                     //System.out.println("*DEBUG* ALLOCATED DELETED INDEX " + index);
                     // check for valid seek position
                     long seekp = seekpos(USAGE.FREEH);
-                    if (seekp > entryFile.length()) throw new kelondroException("new Handle: seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
-
-                    // read link to next element of FREEH chain
-                    USAGE.FREEH.index = entryFile.readInt(seekp);
+                    if (seekp > entryFile.length()) {
+                        // this is a severe inconsistency. try to heal..
+                        serverLog.logSevere("kelondroRecords/" + filename, "new Handle: lost " + USAGE.FREEC + " marked nodes; seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
+                        index = USAGE.allCount(); // a place at the end of the file
+                        USAGE.USEDC += USAGE.FREEC; // to avoid that non-empty records at the end are overwritten
+                        USAGE.FREEC = 0; // discard all possible empty nodes
+                        USAGE.FREEH.index = NUL;
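+                        // the free-list is discarded entirely here; subsequent allocations
+                        // will append nodes at the end of the file instead of re-using records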
+                    } else {
+                        // read link to next element of FREEH chain
+                        USAGE.FREEH.index = entryFile.readInt(seekp);
+                    }
                 }
                 USAGE.write();
             }
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index 76d57dfe8..6fe6298f7 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -49,7 +49,6 @@ import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
-import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -66,6 +65,8 @@ import java.util.TreeMap;
 import java.util.TreeSet;
 
 import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.htmlFilter.htmlFilterImageEntry;
+import de.anomic.index.indexRWIEntryNew;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.kelondro.kelondroMSetTools;
@@ -114,20 +115,123 @@ public final class plasmaCondenser {
     public int RESULT_NUMB_WORDS = -1;
     public int RESULT_DIFF_WORDS = -1;
     public int RESULT_SIMI_WORDS = -1;
-    public int RESULT_WORD_ENTROPHY = -1;
     public int RESULT_NUMB_SENTENCES = -1;
     public int RESULT_DIFF_SENTENCES = -1;
     public int RESULT_SIMI_SENTENCES = -1;
     public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);
     
-    public plasmaCondenser(plasmaParserDocument document) throws UnsupportedEncodingException {
+    public plasmaCondenser(plasmaParserDocument document, boolean addMedia) throws UnsupportedEncodingException {
+        // if addMedia == true, then all the media links are also parsed and added to the words
+        // added media words are flagged with the appropriate media flag
         this(document.getText(), document.getCharset());
+        
+        kelondroBitfield wflags = (kelondroBitfield) RESULT_FLAGS.clone(); // the template for the word flags, only from position 0..19
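+        // (note) the template is cloned before the flag_cat_* bits are set on RESULT_FLAGS
+        // below, so a word only carries a media-category flag if it stems from such a link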
+        
         // construct flag set for document
         if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
         if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
         if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
         if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
+        
+        // the phrase counter:
+        // phrase 0 are words taken from the URL
+        // phrase 1 is the MainLongTitle
+        // phrase 2 is the MainShortTitle
+        // phrase 3 is the Document Abstract
+        // phrase 4 is the Document Author
+        // phrase 5 are the tags specified in document
+        // phrase 10 and above are the section headlines/titles (88 possible)
+        // phrase 98 is taken from the embedded anchor/hyperlinks description
+        // phrase 99 is taken from the media Link url and anchor description
+        // phrase 100 and above are lines from the text
+        
+        insertTextToWords(document.getMainLongTitle(), 1, indexRWIEntryNew.flag_app_descr, wflags);
+        insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
+        insertTextToWords(document.getAbstract(), 3, indexRWIEntryNew.flag_app_descr, wflags);
+        // missing: author!
+        // missing: tags!
+        
+        String[] titles = document.getSectionTitles();
+        for (int i = 0; i < titles.length; i++) {
+            insertTextToWords(titles[i], i + 10, indexRWIEntryNew.flag_app_emphasized, wflags);
+        }
+        
+        // anchors
+        Iterator i = document.getAnchors().entrySet().iterator();
+        Map.Entry entry;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 98, indexRWIEntryNew.flag_app_url, wflags);
+            insertTextToWords((String) entry.getValue(), 98, indexRWIEntryNew.flag_app_url, wflags);
+        }
+        
+        // audio
+        i = document.getAudiolinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, wflags);
+        }
+        
+        // video
+        i = document.getVideolinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, wflags);
+        }
+        
+        // applications
+        i = document.getApplinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, wflags);
+        }
+        
+        // images
+        i = document.getImages().iterator();
+        htmlFilterImageEntry ientry;
+        while (i.hasNext()) {
+            ientry = (htmlFilterImageEntry) i.next();
+            insertTextToWords((String) ientry.url().toNormalform(), 99, flag_cat_hasimage, wflags);
+            insertTextToWords((String) ientry.alt(), 99, flag_cat_hasimage, wflags);
+        }
+        
+        // finally check all words for missing flag entry
+        i = words.entrySet().iterator();
+        wordStatProp wprop;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            wprop = (wordStatProp) entry.getValue();
+            if (wprop.flags == null) {
+                wprop.flags = (kelondroBitfield) wflags.clone();
+                words.put(entry.getKey(), wprop);
+            }
+        }
     }
 
+    private void insertTextToWords(String text, int phrase, int flagpos, kelondroBitfield flagstemplate) {
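+        // tokenizes the given text and registers every word with the given phrase number
+        // and flag bit; e.g. a section title "Deep Sea" adds the words "deep" and "sea"
+        // with its headline phrase number and the flag_app_emphasized bit set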
+        String word;
+        wordStatProp wprop;
+        sievedWordsEnum wordenum;
+        try {
+            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8", 3);
+        } catch (UnsupportedEncodingException e) {
+            return;
+        }
+        int pip = 0;
+        while (wordenum.hasMoreElements()) {
+            word = ((String) wordenum.nextElement()).toLowerCase();
+            wprop = (wordStatProp) words.get(word);
+            if (wprop == null) wprop = new wordStatProp(0, pip, phrase);
+            if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone();
+            wprop.numOfPhrase = phrase;
+            wprop.posInPhrase = pip;
+            wprop.flags.set(flagpos, true);
+            words.put(word, wprop);
+            pip++;
+        }
+    }
+    
     public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException {
         this(text, charset, 3, 2);
     }
@@ -174,18 +278,19 @@ public final class plasmaCondenser {
     }
 
     public Map words() {
-        // returns the words as wod/wordStatProp relation map
+        // returns the words as word/wordStatProp relation map
         return words;
     }
 
     public static class wordStatProp {
         // object carries statistics for words and sentences
 
-        public int count;       // number of occurrences
-        public int posInText;   // unique handle, is initialized with word position (excluding double occurring words)
-        public int posInPhrase; //
-        public int numOfPhrase;
-        public HashSet hash;    //
+        public int count;              // number of occurrences
+        public int posInText;          // unique handle, is initialized with word position (excluding double occurring words)
+        public int posInPhrase;        // position of word in phrase
+        public int numOfPhrase;        // number of phrase. 'normal' phrases begin with number 100
+        public HashSet hash;           // a set of handles to all sentences where this word appears
+        public kelondroBitfield flags; // the flag bits for each word
 
         public wordStatProp(int handle, int pip, int nop) {
             this.count = 1;
@@ -193,6 +298,7 @@ public final class plasmaCondenser {
             this.posInPhrase = pip;
             this.numOfPhrase = nop;
             this.hash = new HashSet();
+            this.flags = null;
         }
 
         public void inc() {
@@ -314,7 +420,7 @@ public final class plasmaCondenser {
             } else {
                 // word does not yet exist, create new word entry
                 wordHandle = wordHandleCount++;
-                wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 1);
+                wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 100);
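+                // text phrases now start at 100; numbers below 100 are reserved for
+                // structural phrases (title, abstract, headlines, anchors, media links)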
             }
             words.put(word, wsp);
             // we now have the unique handle of the word, put it into the sentence:
@@ -429,7 +535,6 @@ public final class plasmaCondenser {
         this.RESULT_NUMB_WORDS = allwordcounter;
         this.RESULT_DIFF_WORDS = wordHandleCount;
         this.RESULT_SIMI_WORDS = words.size();
-        this.RESULT_WORD_ENTROPHY = (allwordcounter == 0) ? 0 : (255 * words.size() / allwordcounter);
         this.RESULT_NUMB_SENTENCES = allsentencecounter;
         this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
         this.RESULT_SIMI_SENTENCES = sentences.size();
@@ -508,6 +613,7 @@ public final class plasmaCondenser {
         return orderedSentences;
     }
 
+    /*
     public void writeMapToFile(File out) throws IOException {
         Map.Entry entry;
         String k;
@@ -520,7 +626,7 @@ public final class plasmaCondenser {
         // we reconstruct the word hashtable
         // and sort the entries by the number of occurrences
         // this structure is needed to print out a sorted list of words
-        TreeMap sortedWords = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/);
+        TreeMap sortedWords = new TreeMap(); //kelondroNaturalOrder.naturalOrder
         it = words.entrySet().iterator(); // enumerates the keys in ascending order
         while (it.hasNext()) {
             entry = (Map.Entry) it.next();
@@ -549,7 +655,7 @@ public final class plasmaCondenser {
         }
         writer.close();
     }
-    
+*/
     public final static boolean invisible(char c) {
         // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
         if ((c < ' ') || (c > 'z')) return true;
@@ -771,16 +877,22 @@ public final class plasmaCondenser {
 
     }
 
-    public static Map getWords(InputStream input, String charset) throws UnsupportedEncodingException {
-        if (input == null) return null;
-        plasmaCondenser condenser = new plasmaCondenser(input, charset);
-        return condenser.words;
-    }
-    
     public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException {
+        // returns a word/wordStatProp relation map
         if (text == null) return null;
         ByteArrayInputStream buffer = new ByteArrayInputStream(text);
-        return getWords(buffer, charset);
+        return new plasmaCondenser(buffer, charset, 2, 1).words();
+    }
+    
+    public static Map getWords(String text) {
+        // returns a word/wordStatProp relation map
+        if (text == null) return null;
+        ByteArrayInputStream buffer = new ByteArrayInputStream(text.getBytes());
+        try {
+            return new plasmaCondenser(buffer, "UTF-8", 2, 1).words();
+        } catch (UnsupportedEncodingException e) {
+            return null;
+        }
     }
 
     public static void main(String[] args) {
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index 3194f3360..532b7417c 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -220,6 +220,7 @@ public class plasmaParserDocument {
     
     public Map getAnchors() {
         // returns all links embedded as anchors (clickeable entities)
+        // this is a url(String)/text(String) map
         return anchors;
     }
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index c0c9874c7..7028655bf 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -52,6 +52,7 @@ import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Set;
 
 import de.anomic.http.httpHeader;
@@ -255,27 +256,38 @@ public class plasmaSnippetCache {
             try { resContent.close(); } catch (Exception e) {/* ignore this */}
         }
         if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
-        
-        //System.out.println("loaded document for URL " + url);
-        final Enumeration sentences = document.getSentences(pre);
-        document.close();
-        //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
-        if (sentences == null) {
-            //System.out.println("found no sentences in url " + url);
-            return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
-        }
-
+        
         /* ===========================================================================
          * COMPUTE SNIPPET
         * =========================================================================== */
         // we have found a parseable non-empty file: use the lines
-        line = computeSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
-        //System.out.println("loaded snippet for URL " + url + ": " + line);
+        
+        // compute snippet from text
+        final Enumeration sentences = document.getSentences(pre);
+        if (sentences == null) return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
+        String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
+        
+        // compute snippet from media
+        String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+        String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
+        String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
+        //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
+        //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
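+        // (note) the snippet is assembled below in fixed order: audio, video and
+        // application links first, the text snippet last, separated by <br /> tags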
+        
+        line = "";
+        if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
+        if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
+        if (appline != null) line += (line.length() == 0) ? appline : "<br />" + appline;
+        //if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
+        if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;
+        
-        if (line == null) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found");
+        if (line.length() == 0) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found");
         if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
 
         // finally store this snippet in our own cache
         storeToCache(wordhashes, urlhash, line);
+        document.close();
         return new Snippet(line, source, null);
     }
@@ -366,7 +378,32 @@ public class plasmaSnippetCache {
         return (String) snippetsCache.get(key);
     }
 
-    private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
+    private String computeMediaSnippet(Map media, Set queryhashes) {
" + ((desc.length() == 0) ? url : desc) + ""; + continue; + } + s = removeAppearanceHashes(desc, s); + if (s.size() == 0) { + result += "
" + ((desc.length() == 0) ? url : desc) + ""; + continue; + } + } + if (result.length() == 0) return null; + return result.substring(6); + } + + private String computeTextSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) { try { if (sentences == null) return null; if ((queryhashes == null) || (queryhashes.size() == 0)) return null; @@ -404,20 +441,43 @@ public class plasmaSnippetCache { shortLineLength = ((String) sb.get(i)).length(); } } + // find a first result - String result = (String) sb.get(shortLineIndex); - // remove all hashes that appear in the result - hs = hashSentence(result); + String result = computeTextSnippet((String) sb.get(shortLineIndex), queryhashes, minLength, maxLength); + Set remaininghashes = removeAppearanceHashes(result, queryhashes); + + if (remaininghashes.size() == 0) return result; + // the result has not all words in it. + // find another sentence that represents the missing other words + // and find recursively more sentences + maxLength = maxLength - result.length(); + if (maxLength < 20) maxLength = 20; + String nextSnippet = computeTextSnippet(sentences, remaininghashes, minLength, maxLength); + if (nextSnippet == null) return null; + return result + (" / " + nextSnippet); + } catch (IndexOutOfBoundsException e) { + log.logSevere("computeSnippet: error with string generation", e); + return ""; + } + } + + private String computeTextSnippet(String sentence, Set queryhashes, int minLength, int maxLength) { + try { + if (sentence == null) return null; + if ((queryhashes == null) || (queryhashes.size() == 0)) return null; + Iterator j; + HashMap hs; + String hash; + + // find all hashes that appear in the sentence + hs = hashSentence(sentence); j = queryhashes.iterator(); Integer pos; - Set remaininghashes = new HashSet(); - int p, minpos = result.length(), maxpos = -1; + int p, minpos = sentence.length(), maxpos = -1; while (j.hasNext()) { hash = (String) j.next(); pos = (Integer) hs.get(hash); - if (pos == null) { - remaininghashes.add(new String(hash)); - } else { + if (pos != null) { p = pos.intValue(); if (p > maxpos) maxpos = p; if (p < minpos) minpos = p; @@ -425,51 +485,62 @@ public class plasmaSnippetCache { } // check result size maxpos = maxpos + 10; - if (maxpos > result.length()) maxpos = result.length(); + if (maxpos > sentence.length()) maxpos = sentence.length(); if (minpos < 0) minpos = 0; // we have a result, but is it short enough? if (maxpos - minpos + 10 > maxLength) { // the string is too long, even if we cut at both ends // so cut here in the middle of the string - int lenb = result.length(); - result = result.substring(0, (minpos + 20 > result.length()) ? result.length() : minpos + 20).trim() + + int lenb = sentence.length(); + sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() + " [..] " + - result.substring((maxpos + 26 > result.length()) ? result.length() : maxpos + 26).trim(); - maxpos = maxpos + lenb - result.length() + 6; + sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim(); + maxpos = maxpos + lenb - sentence.length() + 6; } if (maxpos > maxLength) { // the string is too long, even if we cut it at the end // so cut it here at both ends at once int newlen = maxpos - minpos + 10; int around = (maxLength - newlen) / 2; - result = "[..] " + result.substring(minpos - around, ((maxpos + around) > result.length()) ? result.length() : (maxpos + around)).trim() + " [..]"; + sentence = "[..] 
" + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]"; minpos = around; - maxpos = result.length() - around - 5; + maxpos = sentence.length() - around - 5; } - if (result.length() > maxLength) { - // trim result, 1st step (cut at right side) - result = result.substring(0, maxpos).trim() + " [..]"; + if (sentence.length() > maxLength) { + // trim sentence, 1st step (cut at right side) + sentence = sentence.substring(0, maxpos).trim() + " [..]"; } - if (result.length() > maxLength) { - // trim result, 2nd step (cut at left side) - result = "[..] " + result.substring(minpos).trim(); + if (sentence.length() > maxLength) { + // trim sentence, 2nd step (cut at left side) + sentence = "[..] " + sentence.substring(minpos).trim(); } - if (result.length() > maxLength) { - // trim result, 3rd step (cut in the middle) - result = result.substring(6, 20).trim() + " [..] " + result.substring(result.length() - 26, result.length() - 6).trim(); + if (sentence.length() > maxLength) { + // trim sentence, 3rd step (cut in the middle) + sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim(); } - if (queryhashes.size() == 0) return result; - // the result has not all words in it. - // find another sentence that represents the missing other words - // and find recursively more sentences - maxLength = maxLength - result.length(); - if (maxLength < 20) maxLength = 20; - String nextSnippet = computeSnippet(sentences, remaininghashes, minLength, maxLength); - return result + ((nextSnippet == null) ? "" : (" / " + nextSnippet)); + return sentence; } catch (IndexOutOfBoundsException e) { log.logSevere("computeSnippet: error with string generation", e); - return ""; + return null; + } + } + + private Set removeAppearanceHashes(String sentence, Set queryhashes) { + // remove all hashes that appear in the sentence + if (sentence == null) return queryhashes; + HashMap hs = hashSentence(sentence); + Iterator j = queryhashes.iterator(); + String hash; + Integer pos; + Set remaininghashes = new HashSet(); + while (j.hasNext()) { + hash = (String) j.next(); + pos = (Integer) hs.get(hash); + if (pos == null) { + remaininghashes.add(new String(hash)); + } } + return remaininghashes; } private HashMap hashSentence(String sentence) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index fa1c008bc..e42a43244 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1576,7 +1576,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); log.logFine("Condensing for '" + entry.normalizedURLString() + "'"); - plasmaCondenser condenser = new plasmaCondenser(document); + plasmaCondenser condenser = new plasmaCondenser(document, true); // generate citation reference Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther] @@ -1586,6 +1586,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); // create a new loaded URL db entry + long ldate = System.currentTimeMillis(); indexURLEntry newEntry = wordIndex.loadedURL.newEntry( entry.url(), // URL docDescription, // document description @@ -1594,7 +1595,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser "", // ETag 
                         referrerUrlHash,                           // referer hash
                         new byte[0],                               // md5
                         (int) entry.size(),                        // size
@@ -1655,16 +1656,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                  * STORE PAGE INDEX INTO WORD INDEX DB
                  * ======================================================================== */
                 words = wordIndex.addPageIndex(
-                        entry.url(),        // document url
-                        urlHash,            // document url hash
-                        docDate,            // document mod date
-                        (int) entry.size(), // document size
-                        document,           // document content
-                        condenser,          // document condenser
-                        plasmaURL.language(entry.url()),           // document language
-                        plasmaURL.docType(document.getMimeType()), // document type
-                        ioLinks[0].intValue(),  // outlinkSame
-                        ioLinks[1].intValue()   // outlinkOthers
+                        entry.url(),                                  // document url
+                        urlHash,                                      // document url hash
+                        docDate,                                      // document mod date
+                        (int) entry.size(),                           // document size
+                        document,                                     // document content
+                        condenser,                                    // document condenser
+                        plasmaURL.language(entry.url()),              // document language
+                        plasmaURL.docType(document.getMimeType()),    // document type
+                        ioLinks[0].intValue(),                        // outlinkSame
+                        ioLinks[1].intValue()                         // outlinkOthers
                 );
             } else {
                 /* ========================================================================
@@ -1704,7 +1705,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                             newEntry.size(),
                             docDate.getTime(),
                             System.currentTimeMillis(),
-                            condenser.RESULT_WORD_ENTROPHY,
                             language,
                             doctype,
                             ioLinks[0].intValue(),
@@ -1749,7 +1749,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 }
                 tmpContainers = null;
-            }
+            } //end: SEND PAGE INDEX TO STORAGE PEER
+            
             storageEndTime = System.currentTimeMillis();
 
             //increment number of indexed urls
@@ -2253,7 +2254,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         // get the word set
         Set words = null;
         try {
-            words = new plasmaCondenser(document).words().keySet();
+            words = new plasmaCondenser(document, true).words().keySet();
         } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
         }
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index e126c51fe..0eaf94f26 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -251,22 +251,21 @@ public final class plasmaWordIndex implements indexRI {
         // this is called by the switchboard to put in a new page into the index
         // use all the words in one condenser object to simultanous create index entries
 
-        // iterate over all words
+        int wordCount = 0;
+        int urlLength = url.toString().length();
+        int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
+        
+        // iterate over all words of context text
         Iterator i = condenser.words().entrySet().iterator();
         Map.Entry wentry;
         String word;
         indexRWIEntry ientry;
         plasmaCondenser.wordStatProp wprop;
-        String wordHash;
-        int urlLength = url.toString().length();
-        int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
-        
         while (i.hasNext()) {
             wentry = (Map.Entry) i.next();
             word = (String) wentry.getKey();
             wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
-            // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
-            wordHash = plasmaCondenser.word2hash(word);
+            assert (wprop.flags != null);
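+            // invariant: the condenser constructor now assigns a flag set to every word,
+            // so wprop.flags can be stored in the new RWI entry without a null check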
             ientry = new indexRWIEntryNew(urlHash,
                         urlLength,
                         urlComps,
                         (document == null) ? urlLength : document.getMainLongTitle().length(),
                         wprop.count,
@@ -279,16 +278,15 @@ public final class plasmaWordIndex implements indexRI {
                         size,
                         urlModified.getTime(),
                         System.currentTimeMillis(),
-                        condenser.RESULT_WORD_ENTROPHY,
                         language,
                         doctype,
                         outlinksSame,
                         outlinksOther,
-                        condenser.RESULT_FLAGS);
-            addEntry(wordHash, ientry, System.currentTimeMillis(), false);
+                        wprop.flags);
+            addEntry(plasmaCondenser.word2hash(word), ientry, System.currentTimeMillis(), false);
+            wordCount++;
         }
-        // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
-        // condenser.getWords().size() + " words, flushed " + c + " entries");
-        return condenser.RESULT_SIMI_WORDS;
+        
+        return wordCount;
     }
 
     public indexContainer getContainer(String wordHash, Set urlselection, long maxTime) {