- added correct flagging of word properties

- added self-healing to the database in case wrong free-pointers exist
- added presentation of media links in snippets (does not yet work correctly)
- code cleanup

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3055 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 10d888e70c
commit bf0d820659

@@ -300,7 +300,7 @@ public class IndexControl_p {
"true".equalsIgnoreCase(gzipBody),
timeout);
result = (String) resultObj.get("result");
prop.put("result", (result == null) ? ("Successfully transferred " + index.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
prop.put("result", (result == null) ? ("Successfully transferred " + knownURLs.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries + " URL not found") : result);
index = null;
}

@@ -203,7 +203,7 @@ public final class transferRWI {
}
if (unknownURLs.length() > 0) { unknownURLs.delete(0, 1); }
if ((wordhashes.length == 0) || (received == 0)) {
sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs");
sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs, blocked " + blocked + " RWIs");
} else {
final double avdist = (yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[0]) + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[received - 1])) / 2.0;
sb.getLog().logInfo("Received " + received + " Entries " + wordc + " Words [" + wordhashes[0] + " .. " + wordhashes[received - 1] + "]/" + avdist + " from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + "/" + receivedURL + " URLs, blocked " + blocked + " RWIs");

@@ -46,10 +46,12 @@
// javac -classpath .:../classes transferRWI.java
import java.io.IOException;
import java.text.ParseException;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
@@ -59,11 +61,14 @@ import de.anomic.yacy.yacySeed;
public final class transferURL {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) throws InterruptedException {
if (post == null || ss == null) { return null; }
long start = System.currentTimeMillis();
long freshdate = 0;
try {freshdate = plasmaURL.shortDayFormatter.parse("20061101").getTime();} catch (ParseException e1) {} // cutoff: URL entries whose freshdate is not after 2006-11-01 are rejected below as stale
// return variable that accumulates replacements
final plasmaSwitchboard sb = (plasmaSwitchboard) ss;
final serverObjects prop = new serverObjects();
@@ -93,35 +98,45 @@ public final class transferURL {
indexURLEntry lEntry;
for (int i = 0; i < urlc; i++) {
serverCore.checkInterruption();
// read new lurl-entry
urls = (String) post.get("url" + i);
if (urls == null) {
yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
} else {
lEntry = sb.wordIndex.loadedURL.newEntry(urls);
if (lEntry == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???
} else {
indexURLEntry.Components comp = lEntry.comp();
if (comp.url() == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
// TODO: should we send back an error message???
} else {
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
blocked++;
} else try {
sb.wordIndex.loadedURL.store(lEntry);
sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
received++;
} catch (IOException e) {
e.printStackTrace();
}
}
}
continue;
}
// parse new lurl-entry
lEntry = sb.wordIndex.loadedURL.newEntry(urls);
if (lEntry == null) {
yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
continue;
}
// check if entry is well-formed
indexURLEntry.Components comp = lEntry.comp();
if ((comp.url() == null) || (lEntry.freshdate().getTime() <= freshdate)) {
yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + "\n\tURL Property: " + urls);
continue;
}
// check if the entry is blacklisted
if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
blocked++;
continue;
}
// write entry to database
try {
sb.wordIndex.loadedURL.store(lEntry);
sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
received++;
} catch (IOException e) {
e.printStackTrace();
}
}
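The rewritten loop above replaces the old nested else-branches with early continue guards: an incoming URL entry must pass every check before it is stored. A minimal sketch of that control flow; the helpers isBlacklisted and store are hypothetical stand-ins for the inlined code:

    // Sketch only -- guard order as in the loop above; isBlacklisted/store are hypothetical.
    for (int i = 0; i < urlc; i++) {
        String urls = (String) post.get("url" + i);
        if (urls == null) continue;                               // nothing transmitted
        indexURLEntry lEntry = sb.wordIndex.loadedURL.newEntry(urls);
        if (lEntry == null) continue;                             // unparseable entry
        indexURLEntry.Components comp = lEntry.comp();
        if (comp.url() == null) continue;                         // malformed URL
        if (lEntry.freshdate().getTime() <= freshdate) continue;  // stale (before 2006-11-01)
        if (blockBlacklist && isBlacklisted(lEntry, comp)) { blocked++; continue; }
        store(lEntry);                                            // write to loadedURL db
        received++;
    }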

@@ -117,7 +117,6 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
int sizeOfPage, // # of bytes of the page TODO: not needed any more
long lastmodified, // last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
int quality, // the entropy value
String language, // (guessed) language of document
char doctype, // type of document
int outlinksSame, // outlinks to same domain

@@ -33,7 +33,6 @@ import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.yacy.yacySeedDB;
@@ -66,7 +65,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
private static final int col_hitcount = 3;
private static final int col_language = 4;
private static final int col_doctype = 5;
private static final int col_localflag = 6;
//private static final int col_localflag = 6;
private static final int col_posintext = 7;
private static final int col_posinphrase = 8;
private static final int col_posofphrase = 9;
@@ -77,6 +76,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
private kelondroRow.Entry entry;
/*
public indexRWIEntryOld(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
@@ -91,7 +91,6 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
int sizeOfPage, // # of bytes of the page
long lastmodified, //*last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
int quality, //*the entropy value
String language, //*(guessed) language of document
char doctype, //*type of document
int outlinksSame, // outlinks to same domain
@@ -107,7 +106,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
this.entry = urlEntryRow.newEntry();
this.entry.setCol(col_urlhash, urlHash, null);
this.entry.setCol(col_quality, quality);
this.entry.setCol(col_quality, 0);
this.entry.setCol(col_lastModified, lastmodified);
this.entry.setCol(col_hitcount, hitcount);
this.entry.setCol(col_language, language, null);
@@ -121,7 +120,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
this.entry.setCol(col_phrasecount, phrasecount);
//System.out.println("DEBUG-NEWENTRY " + toPropertyForm());
}
*/
public indexRWIEntryOld(String urlHash, String code) {
// the code is the external form of the row minus the leading urlHash entry
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());

@@ -24,7 +24,7 @@
package de.anomic.kelondro;
public class kelondroBitfield {
public class kelondroBitfield implements Cloneable {
// the bitfield implements a binary array. Such arrays may be exported in a base64-String
@@ -55,6 +55,12 @@ public class kelondroBitfield {
}
}
public Object clone() {
kelondroBitfield theClone = new kelondroBitfield(new byte[this.bb.length]);
System.arraycopy(this.bb, 0, theClone.bb, 0, this.bb.length);
return theClone;
}
public void set(int pos, boolean value) {
assert (pos >= 0);
int slot = pos / 8;
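A minimal usage sketch for the new clone() method; the kelondroBitfield(int) constructor and set() are taken from the surrounding code. Because the backing byte array is copied with System.arraycopy, the clone can diverge from the original without side effects:

    kelondroBitfield original = new kelondroBitfield(4);         // 4 bytes = 32 bit positions
    original.set(3, true);
    kelondroBitfield copy = (kelondroBitfield) original.clone();
    copy.set(7, true);   // leaves 'original' untouched: the byte array was deep-copied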

@@ -1392,7 +1392,7 @@ public class kelondroRecords {
USAGE.FREEC--;
// take link
if (USAGE.FREEH.index == NUL) {
System.out.println("INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records. Affected file: " + filename);
serverLog.logSevere("kelondroRecords/" + filename, "INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records.");
// try to heal..
USAGE.USEDC = USAGE.allCount() + 1;
USAGE.FREEC = 0;
@@ -1402,10 +1402,17 @@
//System.out.println("*DEBUG* ALLOCATED DELETED INDEX " + index);
// check for valid seek position
long seekp = seekpos(USAGE.FREEH);
if (seekp > entryFile.length()) throw new kelondroException("new Handle: seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
// read link to next element of FREEH chain
USAGE.FREEH.index = entryFile.readInt(seekp);
if (seekp > entryFile.length()) {
// this is a severe inconsistency. try to heal..
serverLog.logSevere("kelondroRecords/" + filename, "new Handle: lost " + USAGE.FREEC + " marked nodes; seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
index = USAGE.allCount(); // a place at the end of the file
USAGE.USEDC += USAGE.FREEC; // to avoid that non-empty records at the end are overwritten
USAGE.FREEC = 0; // discard all possible empty nodes
USAGE.FREEH.index = NUL;
} else {
// read link to next element of FREEH chain
USAGE.FREEH.index = entryFile.readInt(seekp);
}
}
USAGE.write();
}
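Both healing branches above appear to rest on the same bookkeeping invariant (an assumption read off the surrounding code): the node file holds USAGE.USEDC used plus USAGE.FREEC free records, so allCount() == USEDC + FREEC. A comment sketch of why the counters are adjusted this way:

    // When the free-pointer chain is corrupt, the FREEC records it described can no
    // longer be located safely. Reclassifying them as used
    //     USAGE.USEDC += USAGE.FREEC;   // keeps allCount() == USEDC + FREEC
    //     USAGE.FREEC  = 0;
    //     USAGE.FREEH.index = NUL;      // cut the chain
    // sacrifices their reuse but guarantees that live records at the end of the file
    // are never overwritten; new handles are simply appended (index = allCount()).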

@@ -49,7 +49,6 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
@@ -66,6 +65,8 @@ import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.index.indexRWIEntryNew;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
@@ -114,20 +115,123 @@ public final class plasmaCondenser {
public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1;
public int RESULT_SIMI_WORDS = -1;
public int RESULT_WORD_ENTROPHY = -1;
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public int RESULT_SIMI_SENTENCES = -1;
public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);
public plasmaCondenser(plasmaParserDocument document) throws UnsupportedEncodingException {
public plasmaCondenser(plasmaParserDocument document, boolean addMedia) throws UnsupportedEncodingException {
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
this(document.getText(), document.getCharset());
kelondroBitfield wflags = (kelondroBitfield) RESULT_FLAGS.clone(); // the template for the word flags, only from position 0..19
// construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainLongTitle
// phrase 2 is the MainShortTitle
// phrase 3 is the Document Abstract
// phrase 4 is the Document Author
// phrase 5 are the tags specified in document
// phrase 10 and above are the section headlines/titles (88 possible)
// phrase 98 is taken from the embedded anchor/hyperlinks description
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(document.getMainLongTitle(), 1, indexRWIEntryNew.flag_app_descr, wflags);
insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
insertTextToWords(document.getAbstract(), 3, indexRWIEntryNew.flag_app_descr, wflags);
// missing: author!
// missing: tags!
String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
insertTextToWords(titles[i], i + 10, indexRWIEntryNew.flag_app_emphasized, wflags);
}
// anchors
Iterator i = document.getAnchors().entrySet().iterator();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 98, indexRWIEntryNew.flag_app_url, wflags);
insertTextToWords((String) entry.getValue(), 98, indexRWIEntryNew.flag_app_url, wflags);
}
// audio
i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, wflags);
}
// video
i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, wflags);
}
// applications
i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, wflags);
insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, wflags);
}
// images
i = document.getImages().iterator();
htmlFilterImageEntry ientry;
while (i.hasNext()) {
ientry = (htmlFilterImageEntry) i.next();
insertTextToWords((String) ientry.url().toNormalform(), 99, flag_cat_hasimage, wflags);
insertTextToWords((String) ientry.alt(), 99, flag_cat_hasimage, wflags);
}
// finally check all words for missing flag entry
i = words.entrySet().iterator();
wordStatProp wprop;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
wprop = (wordStatProp) entry.getValue();
if (wprop.flags == null) {
wprop.flags = (kelondroBitfield) wflags.clone();
words.put(entry.getKey(), wprop);
}
}
}
private void insertTextToWords(String text, int phrase, int flagpos, kelondroBitfield flagstemplate) {
String word;
wordStatProp wprop;
sievedWordsEnum wordenum;
try {
wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8", 3);
} catch (UnsupportedEncodingException e) {
return;
}
int pip = 0;
while (wordenum.hasMoreElements()) {
word = ((String) wordenum.nextElement()).toLowerCase();
wprop = (wordStatProp) words.get(word);
if (wprop == null) wprop = new wordStatProp(0, pip, phrase);
if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone();
wprop.numOfPhrase = 1;
wprop.posInPhrase = pip;
wprop.flags.set(flagpos, true);
words.put(word, wprop);
pip++;
}
}
public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException {
this(text, charset, 3, 2);
}
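A hypothetical caller of the new two-argument (document, addMedia) constructor, mirroring how plasmaSwitchboard uses it further below; 'document' is assumed to be an already-parsed plasmaParserDocument:

    void condense(plasmaParserDocument document) throws UnsupportedEncodingException {
        plasmaCondenser condenser = new plasmaCondenser(document, true); // true: index media links too
        Map words = condenser.words();                      // word -> wordStatProp, flags now attached
        kelondroBitfield docFlags = condenser.RESULT_FLAGS; // document flags (image/audio/video/app)
    }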
@@ -174,18 +278,19 @@ public final class plasmaCondenser {
}
public Map words() {
// returns the words as wod/wordStatProp relation map
// returns the words as word/wordStatProp relation map
return words;
}
public static class wordStatProp {
// object carries statistics for words and sentences
public int count; // number of occurrences
public int posInText; // unique handle, is initialized with word position (excluding double occurring words)
public int posInPhrase; //
public int numOfPhrase;
public HashSet hash; //
public int count; // number of occurrences
public int posInText; // unique handle, is initialized with word position (excluding double occurring words)
public int posInPhrase; // position of word in phrase
public int numOfPhrase; // number of phrase. 'normal' phrases begin with number 100
public HashSet hash; // a set of handles to all sentences where this word appears
public kelondroBitfield flags; // the flag bits for each word
public wordStatProp(int handle, int pip, int nop) {
this.count = 1;
@@ -193,6 +298,7 @@ public final class plasmaCondenser {
this.posInPhrase = pip;
this.numOfPhrase = nop;
this.hash = new HashSet();
this.flags = null;
}
public void inc() {
@@ -314,7 +420,7 @@ public final class plasmaCondenser {
} else {
// word does not yet exist, create new word entry
wordHandle = wordHandleCount++;
wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 1);
wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 100);
}
words.put(word, wsp);
// we now have the unique handle of the word, put it into the sentence:
@@ -429,7 +535,6 @@ public final class plasmaCondenser {
this.RESULT_NUMB_WORDS = allwordcounter;
this.RESULT_DIFF_WORDS = wordHandleCount;
this.RESULT_SIMI_WORDS = words.size();
this.RESULT_WORD_ENTROPHY = (allwordcounter == 0) ? 0 : (255 * words.size() / allwordcounter);
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
this.RESULT_SIMI_SENTENCES = sentences.size();
@@ -508,6 +613,7 @@ public final class plasmaCondenser {
return orderedSentences;
}
/*
public void writeMapToFile(File out) throws IOException {
Map.Entry entry;
String k;
@@ -520,7 +626,7 @@ public final class plasmaCondenser {
// we reconstruct the word hashtable
// and sort the entries by the number of occurrences
// this structure is needed to print out a sorted list of words
TreeMap sortedWords = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/);
TreeMap sortedWords = new TreeMap(); //kelondroNaturalOrder.naturalOrder
it = words.entrySet().iterator(); // enumerates the keys in ascending order
while (it.hasNext()) {
entry = (Map.Entry) it.next();
@@ -549,7 +655,7 @@ public final class plasmaCondenser {
}
writer.close();
}
*/
public final static boolean invisible(char c) {
// TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
if ((c < ' ') || (c > 'z')) return true;
@@ -771,16 +877,22 @@ public final class plasmaCondenser {
}
public static Map getWords(InputStream input, String charset) throws UnsupportedEncodingException {
if (input == null) return null;
plasmaCondenser condenser = new plasmaCondenser(input, charset);
return condenser.words;
}
public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException {
// returns a word/wordStatProp relation map
if (text == null) return null;
ByteArrayInputStream buffer = new ByteArrayInputStream(text);
return getWords(buffer, charset);
return new plasmaCondenser(buffer, charset, 2, 1).words();
}
public static Map getWords(String text) {
// returns a word/wordStatProp relation map
if (text == null) return null;
ByteArrayInputStream buffer = new ByteArrayInputStream(text.getBytes());
try {
return new plasmaCondenser(buffer, "UTF-8", 2, 1).words();
} catch (UnsupportedEncodingException e) {
return null;
}
}
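A minimal usage sketch for the static helper added above; per the code it returns null for null input and swallows the UnsupportedEncodingException, which cannot occur for the built-in UTF-8 charset:

    Map words = plasmaCondenser.getWords("the quick brown fox jumps over the lazy dog");
    // keys: lower-cased words; values: wordStatProp objects (count, positions, flags)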
public static void main(String[] args) {

@@ -220,6 +220,7 @@ public class plasmaParserDocument {
public Map getAnchors() {
// returns all links embedded as anchors (clickable entities)
// this is a url(String)/text(String) map
return anchors;
}

@@ -52,6 +52,7 @@ import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import de.anomic.http.httpHeader;
@@ -255,27 +256,38 @@ public class plasmaSnippetCache {
try { resContent.close(); } catch (Exception e) {/* ignore this */}
}
if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for URL " + url);
final Enumeration sentences = document.getSentences(pre);
document.close();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if (sentences == null) {
//System.out.println("found no sentences in url " + url);
return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
}
/* ===========================================================================
* COMPUTE SNIPPET
* =========================================================================== */
// we have found a parseable non-empty file: use the lines
line = computeSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
//System.out.println("loaded snippet for URL " + url + ": " + line);
// compute snippet from text
final Enumeration sentences = document.getSentences(pre);
if (sentences == null) return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
// compute snippet from media
String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
//String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
//String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
line = "";
if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
if (appline != null) line += (line.length() == 0) ? appline : "<br />" + appline;
//if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;
if (line.length() == 0) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found"); // line was built from "" above, so test for emptiness, not null
if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
// finally store this snippet in our own cache
storeToCache(wordhashes, urlhash, line);
document.close();
return new Snippet(line, source, null);
}
@@ -366,7 +378,32 @@ public class plasmaSnippetCache {
return (String) snippetsCache.get(key);
}
private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
private String computeMediaSnippet(Map media, Set queryhashes) {
Iterator i = media.entrySet().iterator();
Map.Entry entry;
String url, desc;
Set s;
String result = "";
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
desc = (String) entry.getValue();
s = removeAppearanceHashes(url, queryhashes);
if (s.size() == 0) {
result += "<br /><a href=\"" + url + "\">" + ((desc.length() == 0) ? url : desc) + "</a>";
continue;
}
s = removeAppearanceHashes(desc, s);
if (s.size() == 0) {
result += "<br /><a href=\"" + url + "\">" + ((desc.length() == 0) ? url : desc) + "</a>";
continue;
}
}
if (result.length() == 0) return null;
return result.substring(6); // strip the leading "<br />" (6 characters)
}
private String computeTextSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
try {
if (sentences == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
@@ -404,20 +441,43 @@ public class plasmaSnippetCache {
shortLineLength = ((String) sb.get(i)).length();
}
}
// find a first result
String result = (String) sb.get(shortLineIndex);
// remove all hashes that appear in the result
hs = hashSentence(result);
String result = computeTextSnippet((String) sb.get(shortLineIndex), queryhashes, minLength, maxLength);
Set remaininghashes = removeAppearanceHashes(result, queryhashes);
if (remaininghashes.size() == 0) return result;
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - result.length();
if (maxLength < 20) maxLength = 20;
String nextSnippet = computeTextSnippet(sentences, remaininghashes, minLength, maxLength);
if (nextSnippet == null) return null;
return result + (" / " + nextSnippet);
} catch (IndexOutOfBoundsException e) {
log.logSevere("computeTextSnippet: error with string generation", e);
return "";
}
}
private String computeTextSnippet(String sentence, Set queryhashes, int minLength, int maxLength) {
try {
if (sentence == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
Iterator j;
HashMap hs;
String hash;
// find all hashes that appear in the sentence
hs = hashSentence(sentence);
j = queryhashes.iterator();
Integer pos;
Set remaininghashes = new HashSet();
int p, minpos = result.length(), maxpos = -1;
int p, minpos = sentence.length(), maxpos = -1;
while (j.hasNext()) {
hash = (String) j.next();
pos = (Integer) hs.get(hash);
if (pos == null) {
remaininghashes.add(new String(hash));
} else {
if (pos != null) {
p = pos.intValue();
if (p > maxpos) maxpos = p;
if (p < minpos) minpos = p;
@@ -425,51 +485,62 @@ public class plasmaSnippetCache {
}
// check result size
maxpos = maxpos + 10;
if (maxpos > result.length()) maxpos = result.length();
if (maxpos > sentence.length()) maxpos = sentence.length();
if (minpos < 0) minpos = 0;
// we have a result, but is it short enough?
if (maxpos - minpos + 10 > maxLength) {
// the string is too long, even if we cut at both ends
// so cut here in the middle of the string
int lenb = result.length();
result = result.substring(0, (minpos + 20 > result.length()) ? result.length() : minpos + 20).trim() +
int lenb = sentence.length();
sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() +
" [..] " +
result.substring((maxpos + 26 > result.length()) ? result.length() : maxpos + 26).trim();
maxpos = maxpos + lenb - result.length() + 6;
sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim();
maxpos = maxpos + lenb - sentence.length() + 6;
}
if (maxpos > maxLength) {
// the string is too long, even if we cut it at the end
// so cut it here at both ends at once
int newlen = maxpos - minpos + 10;
int around = (maxLength - newlen) / 2;
result = "[..] " + result.substring(minpos - around, ((maxpos + around) > result.length()) ? result.length() : (maxpos + around)).trim() + " [..]";
sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
minpos = around;
maxpos = result.length() - around - 5;
maxpos = sentence.length() - around - 5;
}
if (result.length() > maxLength) {
// trim result, 1st step (cut at right side)
result = result.substring(0, maxpos).trim() + " [..]";
if (sentence.length() > maxLength) {
// trim sentence, 1st step (cut at right side)
sentence = sentence.substring(0, maxpos).trim() + " [..]";
}
if (result.length() > maxLength) {
// trim result, 2nd step (cut at left side)
result = "[..] " + result.substring(minpos).trim();
if (sentence.length() > maxLength) {
// trim sentence, 2nd step (cut at left side)
sentence = "[..] " + sentence.substring(minpos).trim();
}
if (result.length() > maxLength) {
// trim result, 3rd step (cut in the middle)
result = result.substring(6, 20).trim() + " [..] " + result.substring(result.length() - 26, result.length() - 6).trim();
if (sentence.length() > maxLength) {
// trim sentence, 3rd step (cut in the middle)
sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
}
if (queryhashes.size() == 0) return result;
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - result.length();
if (maxLength < 20) maxLength = 20;
String nextSnippet = computeSnippet(sentences, remaininghashes, minLength, maxLength);
return result + ((nextSnippet == null) ? "" : (" / " + nextSnippet));
return sentence;
} catch (IndexOutOfBoundsException e) {
log.logSevere("computeTextSnippet: error with string generation", e);
return "";
return null;
}
}
private Set removeAppearanceHashes(String sentence, Set queryhashes) {
// remove all hashes that appear in the sentence
if (sentence == null) return queryhashes;
HashMap hs = hashSentence(sentence);
Iterator j = queryhashes.iterator();
String hash;
Integer pos;
Set remaininghashes = new HashSet();
while (j.hasNext()) {
hash = (String) j.next();
pos = (Integer) hs.get(hash);
if (pos == null) {
remaininghashes.add(new String(hash));
}
}
return remaininghashes;
}
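An illustration of removeAppearanceHashes above (callable only inside the class, since the method is private, and assuming hashSentence maps each word's hash to its position): given a sentence and a set of query word hashes, it returns the hashes the sentence does not yet cover.

    Set q = new HashSet();
    q.add(plasmaCondenser.word2hash("yacy"));
    q.add(plasmaCondenser.word2hash("snippet"));
    Set missing = removeAppearanceHashes("yacy is a p2p search engine", q);
    // 'missing' now holds only the hash of "snippet"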
private HashMap hashSentence(String sentence) {

@@ -1576,7 +1576,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
plasmaCondenser condenser = new plasmaCondenser(document);
plasmaCondenser condenser = new plasmaCondenser(document, true);
// generate citation reference
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
@@ -1586,6 +1586,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
// create a new loaded URL db entry
long ldate = System.currentTimeMillis();
indexURLEntry newEntry = wordIndex.loadedURL.newEntry(
entry.url(), // URL
docDescription, // document description
@@ -1594,7 +1595,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
"", // ETag
docDate, // modification date
new Date(), // loaded date
new Date(), // freshdate
new Date(ldate + Math.max(0, ldate - docDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula
referrerUrlHash, // referer hash
new byte[0], // md5
(int) entry.size(), // size
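The new freshdate expression gives a page a time-to-live of half its age at load time, the classic proxy heuristic for documents without an explicit expiry. A worked example under that reading:

    long now     = System.currentTimeMillis();
    long docDate = now - 10L * 24 * 60 * 60 * 1000;      // last modified 10 days ago
    long fresh   = now + Math.max(0, now - docDate) / 2; // assumed fresh for 5 more days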
@@ -1655,16 +1656,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
* STORE PAGE INDEX INTO WORD INDEX DB
* ======================================================================== */
words = wordIndex.addPageIndex(
entry.url(), // document url
urlHash, // document url hash
docDate, // document mod date
(int) entry.size(), // document size
document, // document content
condenser, // document condenser
entry.url(), // document url
urlHash, // document url hash
docDate, // document mod date
(int) entry.size(), // document size
document, // document content
condenser, // document condenser
plasmaURL.language(entry.url()), // document language
plasmaURL.docType(document.getMimeType()), // document type
ioLinks[0].intValue(), // outlinkSame
ioLinks[1].intValue() // outlinkOthers
ioLinks[0].intValue(), // outlinkSame
ioLinks[1].intValue() // outlinkOthers
);
} else {
/* ========================================================================
@@ -1704,7 +1705,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
newEntry.size(),
docDate.getTime(),
System.currentTimeMillis(),
condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
ioLinks[0].intValue(),
@@ -1749,7 +1749,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
tmpContainers = null;
}
} //end: SEND PAGE INDEX TO STORAGE PEER
storageEndTime = System.currentTimeMillis();
//increment number of indexed urls
@@ -2253,7 +2254,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// get the word set
Set words = null;
try {
words = new plasmaCondenser(document).words().keySet();
words = new plasmaCondenser(document, true).words().keySet();
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}

@@ -251,22 +251,21 @@ public final class plasmaWordIndex implements indexRI {
// this is called by the switchboard to put in a new page into the index
// use all the words in one condenser object to simultaneously create index entries
// iterate over all words
int wordCount = 0;
int urlLength = url.toString().length();
int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
// iterate over all words of context text
Iterator i = condenser.words().entrySet().iterator();
Map.Entry wentry;
String word;
indexRWIEntry ientry;
plasmaCondenser.wordStatProp wprop;
String wordHash;
int urlLength = url.toString().length();
int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
while (i.hasNext()) {
wentry = (Map.Entry) i.next();
word = (String) wentry.getKey();
wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaCondenser.word2hash(word);
assert (wprop.flags != null);
ientry = new indexRWIEntryNew(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
wprop.count,
@@ -279,16 +278,15 @@ public final class plasmaWordIndex implements indexRI {
size,
urlModified.getTime(),
System.currentTimeMillis(),
condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
outlinksSame, outlinksOther,
condenser.RESULT_FLAGS);
addEntry(wordHash, ientry, System.currentTimeMillis(), false);
wprop.flags);
addEntry(plasmaCondenser.word2hash(word), ientry, System.currentTimeMillis(), false);
wordCount++;
}
// System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
// condenser.getWords().size() + " words, flushed " + c + " entries");
return condenser.RESULT_SIMI_WORDS;
return wordCount;
}
public indexContainer getContainer(String wordHash, Set urlselection, long maxTime) {
