- enhanced parser: collects audio, video, image and application links

- enhanced condenser: improved handling of UTF-8 and pre-formatted text


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3017 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 984285bdd6
commit ceb9e3aa17

@ -124,11 +124,14 @@ public class CacheAdmin_p {
info.append("<b>TITLE:</b><br>").append(scraper.getTitle()).append("<br>").append("<br>")
.append("<b>SECTION HEADLINES:</b><br>").append(formatTitles(document.getSectionTitles())).append("<br>")
.append("<b>HREF:</b><br>").append(formatAnchor(document.getHyperlinks())).append("<br>")
.append("<b>MEDIA:</b><br>").append(formatAnchor(document.getMedialinks())).append("<br>")
.append("<b>IMAGE:</b><br>").append(formatAnchor(document.getImagelinks())).append("<br>")
.append("<b>AUDIO:</b><br>").append(formatAnchor(document.getAudiolinks())).append("<br>")
.append("<b>VIDEO:</b><br>").append(formatAnchor(document.getVideolinks())).append("<br>")
.append("<b>APPS:</b><br>").append(formatAnchor(document.getApplinks())).append("<br>")
.append("<b>EMAIL:</b><br>").append(formatAnchor(document.getEmaillinks())).append("<br>")
.append("<b>TEXT:</b><br><span class=\"small\">").append(new String(scraper.getText())).append("</span><br>")
.append("<b>LINES:</b><br><span class=\"small\">");
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
final Enumeration sentences = document.getSentences(false);
if (sentences != null) while (sentences.hasMoreElements()) {
info.append((String) sentences.nextElement()).append("<br>");
}

@ -57,6 +57,7 @@ import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.index.indexURLEntry;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
@ -99,7 +100,8 @@ public class ViewFile {
URL url = null;
String descr = "";
int wordCount = 0;
int size = 0;
int size = 0;
boolean pre = false;
// getting the url hash from which the content should be loaded
String urlHash = post.get("urlHash","");
@ -124,6 +126,7 @@ public class ViewFile {
descr = comp.descr();
urlEntry.wordCount();
size = urlEntry.size();
pre = urlEntry.flags().get(plasmaCondenser.flag_cat_indexof);
}
// alternatively, get the url simply from a url String
@ -140,6 +143,7 @@ public class ViewFile {
// define an url by post parameter
url = new URL(urlString);
pre = post.get("pre", "false").equals("true");
} catch (MalformedURLException e) {}
@ -303,14 +307,13 @@ public class ViewFile {
prop.put("viewMode_parsedText", content);
} else {
prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
final Enumeration sentences = document.getSentences(pre);
boolean dark = true;
int i = 0;
if (sentences != null)
while (sentences.hasMoreElements()) {
String currentSentence = wikiCode
.replaceHTML((String) sentences.nextElement());
String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());
// Search word highlighting
String words = post.get("words", null);

@ -360,7 +360,7 @@ public class dir {
public static void indexPhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr, byte[] md5) {
try {
final URL url = new URL(urlstring);
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()), "UTF-8");
final indexURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
url,
"YaCyShare: " + descr,
@ -395,7 +395,7 @@ public class dir {
public static void deletePhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr) {
try {
final String urlhash = plasmaURL.urlHash(new URL(urlstring));
final Iterator words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8"));
final Iterator words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8"), "UTF-8");
Map.Entry entry;
while (words.hasNext()) {
entry = (Map.Entry) words.next();

@ -28,6 +28,9 @@ public class snippet {
// if 'remove' is set to true, then RWI references to URLs that do not have the snippet are removed
boolean remove = post.get("remove", "false").equals("true");
// boolean line_end_with_punctuation
boolean pre = post.get("pre", "false").equals("true");
String querystring = post.get("search", "").trim();
if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
querystring = querystring.substring(1, querystring.length() - 1).trim();
@ -40,10 +43,9 @@ public class snippet {
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
}
// do the search
Set queryHashes = plasmaCondenser.words2hashes(query);
plasmaSnippetCache.Snippet snippet = switchboard.snippetCache.retrieveSnippet(url, queryHashes, true, 260, 10000);
// find snippet
Set queryHashes = plasmaCondenser.words2hashes(query);
plasmaSnippetCache.Snippet snippet = switchboard.snippetCache.retrieveSnippet(url, queryHashes, true, pre, 260, 10000);
prop.put("status",snippet.getSource());
if (snippet.getSource() < 11) {
//prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");

@ -56,6 +56,7 @@ import de.anomic.kelondro.kelondroBitfield;
import de.anomic.index.indexContainer;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
@ -256,7 +257,7 @@ public final class search {
while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
urlentry = (indexURLEntry) acc.nextElement();
if (includesnippet) {
snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, 260, 1000);
snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), 260, 1000);
} else {
snippet = null;
}

@ -31,6 +31,7 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.index.indexRWIEntry;
@ -49,6 +50,7 @@ public interface indexURLEntry {
public int size();
public int wordCount();
public String snippet();
public kelondroBitfield flags();
public indexRWIEntry word();
public boolean isOlder(indexURLEntry other);
public String toString(String snippet);

@ -35,6 +35,7 @@ import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.logging.serverLog;
import de.anomic.tools.crypt;
@ -262,6 +263,10 @@ public class indexURLEntryOld implements indexURLEntry {
public int wordCount() {
return wordCount;
}
// NOTE(review): the legacy (old-format) URL entry appears to carry no
// stored flag bits, so an empty shared constraint bitfield is reported
// instead of a per-entry value — confirm against the new entry format.
public kelondroBitfield flags() {
return plasmaSearchQuery.empty_constraint;
}
public String snippet() {
// the snippet may appear here if the url was transported in a remote search

@ -107,7 +107,7 @@ public final class plasmaCondenser {
private int wordminsize;
private int wordcut;
public int RESULT_NUMB_TEXT_BYTES = -1;
//public int RESULT_NUMB_TEXT_BYTES = -1;
public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1;
public int RESULT_SIMI_WORDS = -1;
@ -117,17 +117,17 @@ public final class plasmaCondenser {
public int RESULT_SIMI_SENTENCES = -1;
public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);
public plasmaCondenser(InputStream text) {
this(text, 3, 2);
public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException {
this(text, charset, 3, 2);
}
public plasmaCondenser(InputStream text, int wordminsize, int wordcut) {
public plasmaCondenser(InputStream text, String charset, int wordminsize, int wordcut) throws UnsupportedEncodingException {
this.wordminsize = wordminsize;
this.wordcut = wordcut;
// analysis = new Properties();
words = new TreeMap();
sentences = new HashMap();
createCondensement(text);
createCondensement(text, charset);
}
// create a word hash
@ -225,7 +225,7 @@ public final class plasmaCondenser {
return s;
}
private void createCondensement(InputStream is) {
private void createCondensement(InputStream is, String charset) throws UnsupportedEncodingException {
words = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/);
sentences = new HashMap();
@ -244,10 +244,10 @@ public final class plasmaCondenser {
int idx;
int wordInSentenceCounter = 1;
Iterator it, it1;
boolean comb_indexof = false, comb_lastmodified = false, last_last = false, last_index = false;
boolean comb_indexof = false, last_last = false, last_index = false;
// read source
sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize);
sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize);
while (wordenum.hasMoreElements()) {
word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
//System.out.println("PARSED-WORD " + word);
@ -285,7 +285,10 @@ public final class plasmaCondenser {
wordInSentenceCounter = 1;
} else {
// check index.of detection
if ((last_last) && (word.equals("modified"))) comb_lastmodified = true;
if ((last_last) && (comb_indexof) && (word.equals("modified"))) {
this.RESULT_FLAGS.set(flag_cat_indexof, true);
wordenum.pre(true); // parse lines as they come with CRLF
}
if ((last_index) && (word.equals("of"))) comb_indexof = true;
last_last = word.equals("last");
last_index = word.equals("index");
@ -412,7 +415,7 @@ public final class plasmaCondenser {
}
// store result
this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
//this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
this.RESULT_NUMB_WORDS = allwordcounter;
this.RESULT_DIFF_WORDS = wordHandleCount;
this.RESULT_SIMI_WORDS = words.size();
@ -420,7 +423,6 @@ public final class plasmaCondenser {
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
this.RESULT_SIMI_SENTENCES = sentences.size();
this.RESULT_FLAGS.set(flag_cat_indexof, comb_indexof && comb_lastmodified);
}
public void print() {
@ -544,10 +546,9 @@ public final class plasmaCondenser {
return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
}
public static Enumeration wordTokenizer(String s, int minLength) {
public static Enumeration wordTokenizer(String s, String charset, int minLength) {
try {
// TODO: Bugfix for UTF-8 needed
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), minLength);
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), charset, minLength);
} catch (Exception e) {
return null;
}
@ -560,13 +561,17 @@ public final class plasmaCondenser {
unsievedWordsEnum e;
int ml;
public sievedWordsEnum(InputStream is, int minLength) {
e = new unsievedWordsEnum(is);
public sievedWordsEnum(InputStream is, String charset, int minLength) throws UnsupportedEncodingException {
e = new unsievedWordsEnum(is, charset);
buffer = nextElement0();
ml = minLength;
}
private Object nextElement0() {
// Forward the pre-formatted-text flag to the underlying word source.
public void pre(boolean x) {
e.pre(x);
}
private Object nextElement0() {
String s;
char c;
loop: while (e.hasMoreElements()) {
@ -596,23 +601,24 @@ public final class plasmaCondenser {
return r;
}
public int count() {
return e.count();
}
}
private static class unsievedWordsEnum implements Enumeration {
Object buffer = null;
linesFromFileEnum e;
sentencesFromInputStreamEnum e;
String s;
public unsievedWordsEnum(InputStream is) {
e = new linesFromFileEnum(is);
public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is, charset);
s = "";
buffer = nextElement0();
}
// Forward the pre-formatted-text flag to the sentence enumerator.
public void pre(boolean x) {
e.pre(x);
}
private Object nextElement0() {
String r;
StringBuffer sb;
@ -656,66 +662,9 @@ public final class plasmaCondenser {
return r;
}
public int count() {
return e.count();
}
}
private static class linesFromFileEnum implements Enumeration {
// read in lines from a given input stream
// every line starting with a '#' is treated as a comment.
Object buffer = null;
BufferedReader raf;
int counter = 0;
public linesFromFileEnum(InputStream is) {
raf = new BufferedReader(new InputStreamReader(is)); // TODO: bugfix needed for UTF-8, use charset for reader
buffer = nextElement0();
counter = 0;
}
private Object nextElement0() {
try {
String s;
while (true) {
s = raf.readLine();
if (s == null) {
raf.close();
return null;
}
if (!(s.startsWith("#"))) return s;
}
} catch (IOException e) {
try {
raf.close();
} catch (Exception ee) {
}
return null;
}
}
public boolean hasMoreElements() {
return buffer != null;
}
public Object nextElement() {
if (buffer == null) {
return null;
} else {
counter = counter + ((String) buffer).length() + 1;
Object r = buffer;
buffer = nextElement0();
return r;
}
}
public int count() {
return counter;
}
}
public static Enumeration sentencesFromInputStream(InputStream is, String charset) {
public static sentencesFromInputStreamEnum sentencesFromInputStream(InputStream is, String charset) {
try {
return new sentencesFromInputStreamEnum(is, charset);
} catch (UnsupportedEncodingException e) {
@ -723,23 +672,29 @@ public final class plasmaCondenser {
}
}
private static class sentencesFromInputStreamEnum implements Enumeration {
public static class sentencesFromInputStreamEnum implements Enumeration {
// read sentences from a given input stream
// this enumerates String objects
Object buffer = null;
BufferedReader raf;
int counter = 0;
boolean pre = false;
public sentencesFromInputStreamEnum(InputStream is, String charset) throws UnsupportedEncodingException {
raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset));
buffer = nextElement0();
counter = 0;
pre = false;
}
// Toggle pre-formatted mode: when set, readSentence splits at CR/LF
// line breaks instead of at punctuation characters.
public void pre(boolean x) {
this.pre = x;
}
private Object nextElement0() {
try {
String s = readSentence(raf);
String s = readSentence(raf, pre);
//System.out.println(" SENTENCE='" + s + "'"); // DEBUG
if (s == null) {
raf.close();
@ -775,7 +730,7 @@ public final class plasmaCondenser {
}
}
static String readSentence(Reader reader) throws IOException {
static String readSentence(Reader reader, boolean pre) throws IOException {
StringBuffer s = new StringBuffer();
int nextChar;
char c;
@ -789,7 +744,11 @@ public final class plasmaCondenser {
}
c = (char) nextChar;
s.append(c);
if (htmlFilterContentScraper.punctuation(c)) break;
if (pre) {
if ((c == (char) 10) || (c == (char) 13)) break;
} else {
if (htmlFilterContentScraper.punctuation(c)) break;
}
}
// replace line endings and tabs by blanks
@ -802,16 +761,16 @@ public final class plasmaCondenser {
}
public static Iterator getWords(InputStream input) {
public static Iterator getWords(InputStream input, String charset) throws UnsupportedEncodingException {
if (input == null) return null;
plasmaCondenser condenser = new plasmaCondenser(input);
plasmaCondenser condenser = new plasmaCondenser(input, charset);
return condenser.words();
}
public static Iterator getWords(byte[] text) {
public static Iterator getWords(byte[] text, String charset) throws UnsupportedEncodingException {
if (text == null) return null;
ByteArrayInputStream buffer = new ByteArrayInputStream(text);
return getWords(buffer);
return getWords(buffer, charset);
}
public static void main(String[] args) {

@ -139,9 +139,12 @@ public final class plasmaParser {
private static final HashSet mediaExtSet = new HashSet();
/**
* A list of image extensions that should be handleable by image viewer apps
* A list of image, audio, video and application extensions
*/
private static final HashSet imageExtSet = new HashSet();
private static final HashSet audioExtSet = new HashSet();
private static final HashSet videoExtSet = new HashSet();
private static final HashSet appsExtSet = new HashSet();
/**
* This {@link FilenameFilter} is used to find all classes based on there filenames
@ -169,17 +172,23 @@ public final class plasmaParser {
* @see #initMediaExt(String)
*/
static {
String apps = "sit,hqx,img,dmg,exe,com,bat,sh";
String audio = "mp2,mp3,ogg,aac,aif,aiff,wav";
String video = "swf,avi,wmv,rm,mov,mpg,mpeg,ram,m4v";
String image = "jpg,jpeg,jpe,gif,png";
initMediaExt(extString2extList(
"sit,hqx,img,dmg,exe,com,bat,sh" + // application container
"tar,gz,bz2,arj,zip,rar," + // archive formats
"ps,xls,ppt,asf," + // text formats without support
"mp3,ogg,aac," + // audio formats
"swf,avi,wmv,rm,mov,mpg,mpeg,ram," + // video formats
"jpg,jpeg,jpe,gif,png" // image formats
));
initImageExt(extString2extList(
"jpg,jpeg,jpe,gif,png" // image formats
apps + "," + // application container
"tar,gz,bz2,arj,zip,rar," + // archive formats
"ps,xls,ppt,asf," + // text formats without support
audio + "," + // audio formats
video + "," + // video formats
image // image formats
));
initImageExt(extString2extList(image)); // image formats
initAudioExt(extString2extList(audio)); // audio formats
initVideoExt(extString2extList(video)); // video formats
initAppsExt(extString2extList(apps)); // application formats
/* ===================================================
* initializing the parser object pool
@ -272,6 +281,27 @@ public final class plasmaParser {
}
}
/**
 * Replaces the registered audio file extensions with the given list.
 * Access to the shared set is synchronized against concurrent readers.
 */
public static void initAudioExt(List audioExtList) {
    synchronized (audioExtSet) {
        if (!audioExtSet.isEmpty()) audioExtSet.clear();
        audioExtSet.addAll(audioExtList);
    }
}
/**
 * Replaces the registered video file extensions with the given list.
 * Access to the shared set is synchronized against concurrent readers.
 */
public static void initVideoExt(List videoExtList) {
    synchronized (videoExtSet) {
        if (!videoExtSet.isEmpty()) videoExtSet.clear();
        videoExtSet.addAll(videoExtList);
    }
}
/**
 * Replaces the registered application file extensions with the given list.
 * Access to the shared set is synchronized against concurrent readers.
 */
public static void initAppsExt(List appsExtList) {
    synchronized (appsExtSet) {
        if (!appsExtSet.isEmpty()) appsExtSet.clear();
        appsExtSet.addAll(appsExtList);
    }
}
public static String getMediaExtList() {
synchronized (mediaExtSet) {
return mediaExtSet.toString();
@ -343,6 +373,27 @@ public final class plasmaParser {
}
}
/**
 * Tests whether the given file extension is a registered audio extension.
 * Matching is case-insensitive and ignores surrounding whitespace.
 */
public static boolean audioExtContains(String audioExt) {
    // a missing extension can never match a registered one
    if (audioExt == null) {
        return false;
    }
    final String key = audioExt.trim().toLowerCase();
    synchronized (audioExtSet) {
        return audioExtSet.contains(key);
    }
}
/**
 * Tests whether the given file extension is a registered video extension.
 * Matching is case-insensitive and ignores surrounding whitespace.
 */
public static boolean videoExtContains(String videoExt) {
    // a missing extension can never match a registered one
    if (videoExt == null) {
        return false;
    }
    final String key = videoExt.trim().toLowerCase();
    synchronized (videoExtSet) {
        return videoExtSet.contains(key);
    }
}
/**
 * Tests whether the given file extension is a registered application extension.
 * Matching is case-insensitive and ignores surrounding whitespace.
 */
public static boolean appsExtContains(String appsExt) {
    // a missing extension can never match a registered one
    if (appsExt == null) {
        return false;
    }
    final String key = appsExt.trim().toLowerCase();
    synchronized (appsExtSet) {
        return appsExtSet.contains(key);
    }
}
public static String getRealCharsetEncoding(String encoding) {
if ((encoding == null) || (encoding.length() == 0)) return "ISO-8859-1";
@ -887,7 +938,7 @@ public final class plasmaParser {
System.out.println(document.getMainLongTitle());
// found text
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
final Enumeration sentences = document.getSentences(false);
int i = 0;
if (sentences != null) while (sentences.hasMoreElements()) {
System.out.print("line " + i + ": ");

@ -75,8 +75,7 @@ public class plasmaParserDocument {
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
Map hyperlinks;
Map medialinks;
Map hyperlinks, audiolinks, videolinks, imagelinks, applinks;
Map emaillinks;
plasmaCondenser condenser;
boolean resorted;
@ -98,7 +97,10 @@ public class plasmaParserDocument {
this.anchors = (anchors==null)?new HashMap(0):anchors;
this.images = (images==null)?new TreeSet():images;
this.hyperlinks = null;
this.medialinks = null;
this.audiolinks = null;
this.videolinks = null;
this.imagelinks = null;
this.applinks = null;
this.emaillinks = null;
this.condenser = null;
this.resorted = false;
@ -121,7 +123,10 @@ public class plasmaParserDocument {
this.anchors = (anchors==null)?new HashMap(0):anchors;
this.images = (images==null)?new TreeSet():images;
this.hyperlinks = null;
this.medialinks = null;
this.audiolinks = null;
this.videolinks = null;
this.imagelinks = null;
this.applinks = null;
this.emaillinks = null;
this.condenser = null;
this.resorted = false;
@ -190,9 +195,11 @@ public class plasmaParserDocument {
return -1;
}
public Enumeration getSentences(String charset) {
public Enumeration getSentences(boolean pre) {
if (this.text == null) return null;
return plasmaCondenser.sentencesFromInputStream(getText(), charset);
plasmaCondenser.sentencesFromInputStreamEnum e = plasmaCondenser.sentencesFromInputStream(getText(), this.charset);
e.pre(pre);
return e;
}
public String getKeywords(char separator) {
@ -232,10 +239,24 @@ public class plasmaParserDocument {
return hyperlinks;
}
public Map getMedialinks() {
// this is partly subset of getAnchor and getImage: all non-hyperrefs
public Map getAudiolinks() {
if (!resorted) resortLinks();
return medialinks;
return this.audiolinks;
}
/** Returns the URL-to-description map of video links found in the document. */
public Map getVideolinks() {
    // link classification is lazy; make sure it has run at least once
    if (!resorted) {
        resortLinks();
    }
    return this.videolinks;
}
/** Returns the URL-to-description map of image links found in the document. */
public Map getImagelinks() {
    // link classification is lazy; make sure it has run at least once
    if (!resorted) {
        resortLinks();
    }
    return this.imagelinks;
}
/** Returns the URL-to-description map of application links found in the document. */
public Map getApplinks() {
    // link classification is lazy; make sure it has run at least once
    if (!resorted) {
        resortLinks();
    }
    return this.applinks;
}
public Map getEmaillinks() {
@ -248,69 +269,70 @@ public class plasmaParserDocument {
// extract hyperlinks, medialinks and emaillinks from anchorlinks
Iterator i;
String url;
URL url;
String u;
int extpos, qpos;
String ext = null;
i = anchors.entrySet().iterator();
hyperlinks = new HashMap();
medialinks = new HashMap();
imagelinks = new HashMap();
videolinks = new HashMap();
audiolinks = new HashMap();
applinks = new HashMap();
emaillinks = new HashMap();
TreeSet collectedImages = new TreeSet(); // this is a set that is collected now and joined later to the imagelinks
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
if ((url != null) && (url.startsWith("mailto:"))) {
emaillinks.put(url.substring(7), entry.getValue());
u = (String) entry.getKey();
if ((u != null) && (u.startsWith("mailto:"))) {
emaillinks.put(u.substring(7), entry.getValue());
} else {
extpos = url.lastIndexOf(".");
String normal;
extpos = u.lastIndexOf(".");
if (extpos > 0) {
if (((qpos = url.indexOf("?")) >= 0) && (qpos > extpos)) {
ext = url.substring(extpos, qpos).toLowerCase();
if (((qpos = u.indexOf("?")) >= 0) && (qpos > extpos)) {
ext = u.substring(extpos + 1, qpos).toLowerCase();
} else {
ext = url.substring(extpos).toLowerCase();
ext = u.substring(extpos + 1).toLowerCase();
}
try {normal = new URL(url).toNormalform();} catch (MalformedURLException e1) {
normal = null;
}
if (normal != null) { //TODO: extension function is not correct
if (plasmaParser.mediaExtContains(ext.substring(1))) {
try {
url = new URL(u);
u = url.toNormalform();
if (plasmaParser.mediaExtContains(ext)) {
// this is not a normal anchor, its a media link
medialinks.put(normal, entry.getValue());
if (plasmaParser.imageExtContains(ext)) {
imagelinks.put(u, entry.getValue());
collectedImages.add(new htmlFilterImageEntry(url, "", -1, -1));
}
else if (plasmaParser.audioExtContains(ext)) audiolinks.put(u, entry.getValue());
else if (plasmaParser.videoExtContains(ext)) videolinks.put(u, entry.getValue());
else if (plasmaParser.appsExtContains(ext)) applinks.put(u, entry.getValue());
} else {
hyperlinks.put(normal, entry.getValue());
}
if (plasmaParser.imageExtContains(ext.substring(1))) {
try {
collectedImages.add(new htmlFilterImageEntry(new URL(normal), "", -1, -1));
} catch (MalformedURLException e) {}
hyperlinks.put(u, entry.getValue());
}
} catch (MalformedURLException e1) {
}
}
}
}
// add the images to the medialinks
i = images.iterator();
String normal;
htmlFilterImageEntry iEntry;
while (i.hasNext()) {
iEntry = (htmlFilterImageEntry) i.next();
normal = iEntry.url().toNormalform();
if (normal != null) medialinks.put(normal, iEntry.alt()); // avoid NullPointerException
}
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks));
hyperlinks.putAll(plasmaParser.allReflinks(medialinks));
hyperlinks.putAll(plasmaParser.allReflinks(imagelinks));
hyperlinks.putAll(plasmaParser.allReflinks(audiolinks));
hyperlinks.putAll(plasmaParser.allReflinks(videolinks));
hyperlinks.putAll(plasmaParser.allReflinks(applinks));
hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks));
hyperlinks.putAll(plasmaParser.allSubpaths(medialinks));
hyperlinks.putAll(plasmaParser.allSubpaths(imagelinks));
hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks));
hyperlinks.putAll(plasmaParser.allSubpaths(videolinks));
hyperlinks.putAll(plasmaParser.allSubpaths(applinks));
// finally add image links that we collected from the anchors to the image map
i = collectedImages.iterator();
htmlFilterImageEntry iEntry;
while (i.hasNext()) {
iEntry = (htmlFilterImageEntry) i.next();
if (!images.contains(iEntry)) images.add(iEntry);

@ -167,13 +167,11 @@ public final class plasmaSearchResult {
Iterator i = pageAcc.entrySet().iterator();
HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation
Map.Entry entry;
String path = null;
// first scan all entries and find all urls that are referenced
while (i.hasNext()) {
entry = (Map.Entry) i.next();
path = urlPath(((indexURLEntry) entry.getValue()).comp().url());
paths.put(path, entry.getKey());
paths.put(((indexURLEntry) entry.getValue()).comp().url().toNormalform(), entry.getKey());
//if (path != null) path = shortenPath(path);
//if (path != null) paths.put(path, entry.getKey());
}
@ -183,8 +181,7 @@ public final class plasmaSearchResult {
String shorten;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
path = urlPath(((indexURLEntry) entry.getValue()).comp().url());
shorten = shortenPath(path);
shorten = shortenPath(((indexURLEntry) entry.getValue()).comp().url().toNormalform());
// scan all subpaths of the url
while (shorten != null) {
if (pageAcc.size() <= query.wantedResults) break;
@ -206,7 +203,7 @@ public final class plasmaSearchResult {
if (pos < 0) return null;
return path.substring(0, pos);
}
/*
private static String urlPath(URL url) {
String port = ((url.getPort() < 0) ? "" : ":" + url.getPort());
String path = url.getPath();
@ -217,7 +214,7 @@ public final class plasmaSearchResult {
}
return url.getHost() + port + path;
}
*/
public Object[] getReferences(int count) {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
@ -260,7 +257,7 @@ public final class plasmaSearchResult {
String hash, fill;
String[] paths1 = new String[urls.length]; for (int i = 0; i < urls.length; i++) {
fill = ""; for (int j = 0; j < 35 - urls[i].toString().length(); j++) fill +=" ";
paths1[i] = urlPath(urls[i]);
paths1[i] = urls[i].toNormalform();
hash = plasmaURL.urlHash(urls[i]);
System.out.println("paths1[" + urls[i] + fill +"] = " + hash + ", typeID=" + plasmaURL.flagTypeID(hash) + ", tldID=" + plasmaURL.flagTLDID(hash) + ", lengthID=" + plasmaURL.flagLengthID(hash) + " / " + paths1[i]);
}

@ -180,7 +180,7 @@ public class plasmaSnippetCache {
return retrieveFromCache(hashes, plasmaURL.urlHash(url)) != null;
}
public Snippet retrieveSnippet(URL url, Set queryhashes, boolean fetchOnline, int snippetMaxLength, int timeout) {
public Snippet retrieveSnippet(URL url, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout) {
// heise = "0OQUNU3JSs05"
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for URL retrieve " + url);
@ -257,7 +257,7 @@ public class plasmaSnippetCache {
if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
//System.out.println("loaded document for URL " + url);
final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
final Enumeration sentences = document.getSentences(pre);
document.close();
//System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
if (sentences == null) {
@ -475,7 +475,7 @@ public class plasmaSnippetCache {
private HashMap hashSentence(String sentence) {
// generates a word-wordPos mapping
HashMap map = new HashMap();
Enumeration words = plasmaCondenser.wordTokenizer(sentence, 0);
Enumeration words = plasmaCondenser.wordTokenizer(sentence, "UTF-8", 0);
int pos = 0;
String word;
while (words.hasMoreElements()) {
@ -640,7 +640,7 @@ public class plasmaSnippetCache {
urlstring = comp.url().toNormalform();
if ((urlstring.matches(urlmask)) &&
(!(existsInCache(comp.url(), queryhashes)))) {
new Fetcher(comp.url(), queryhashes, (int) maxTime).start();
new Fetcher(comp.url(), queryhashes, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), (int) maxTime).start();
i++;
}
}
@ -650,15 +650,17 @@ public class plasmaSnippetCache {
URL url;
Set queryhashes;
int timeout;
public Fetcher(URL url, Set queryhashes, int timeout) {
boolean pre;
public Fetcher(URL url, Set queryhashes, boolean pre, int timeout) {
if (url.getHost().endsWith(".yacyh")) return;
this.url = url;
this.queryhashes = queryhashes;
this.timeout = timeout;
this.pre = pre;
}
public void run() {
log.logFine("snippetFetcher: try to get URL " + url);
plasmaSnippetCache.Snippet snippet = retrieveSnippet(url, queryhashes, true, 260, timeout);
plasmaSnippetCache.Snippet snippet = retrieveSnippet(url, queryhashes, true, pre, 260, timeout);
if (snippet.line == null)
log.logFine("snippetFetcher: cannot get URL " + url + ". error(" + snippet.source + "): " + snippet.error);
else

@ -106,6 +106,7 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.net.InetAddress;
import java.net.MalformedURLException;
@ -1564,10 +1565,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
plasmaCondenser condenser = new plasmaCondenser(document.getText());
plasmaCondenser condenser = new plasmaCondenser(document.getText(), document.charset);
// generate citation reference
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser);
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
try {
// check for interruption
@ -1575,22 +1576,27 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// create a new loaded URL db entry
indexURLEntry newEntry = urlPool.loadedURL.newEntry(
entry.url(), // URL
docDescription, // document description
"", // author
"", // tags
"", // ETag
docDate, // modification date
new Date(), // loaded date
new Date(), // freshdate
referrerUrlHash, // referer hash
new byte[0], // md5
(int) entry.size(), // size
condenser.RESULT_NUMB_WORDS, // word count
plasmaURL.docType(document.getMimeType()), // doctype
condenser.RESULT_FLAGS, // flags
plasmaURL.language(entry.url()), // language
0,0,0,0,0,0
entry.url(), // URL
docDescription, // document description
"", // author
"", // tags
"", // ETag
docDate, // modification date
new Date(), // loaded date
new Date(), // freshdate
referrerUrlHash, // referer hash
new byte[0], // md5
(int) entry.size(), // size
condenser.RESULT_NUMB_WORDS, // word count
plasmaURL.docType(document.getMimeType()), // doctype
condenser.RESULT_FLAGS, // flags
plasmaURL.language(entry.url()), // language
ioLinks[0].intValue(), // llocal
ioLinks[1].intValue(), // lother
document.audiolinks.size(), // laudio
document.imagelinks.size(), // limage
document.videolinks.size(), // lvideo
document.applinks.size() // lapp
);
/* ========================================================================
* STORE URL TO LOADED-URL-DB
@ -1598,7 +1604,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
urlPool.loadedURL.store(newEntry);
urlPool.loadedURL.stack(
newEntry, // loaded url db entry
initiatorPeerHash, // initiator peer hash
initiatorPeerHash, // initiator peer hash
yacyCore.seedDB.mySeed.hash, // executor peer hash
processCase // process case
);
@ -2094,7 +2100,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
filename = comp.url().getFile();
if ((seed == null) || ((address = seed.getAddress()) == null)) {
// seed is not known from here
removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes()));
removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8"));
urlPool.loadedURL.remove(urlentry.hash()); // clean up
continue; // next result
}
@ -2121,7 +2127,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL wordURL;
if (urlstring.matches(query.urlMask)) { //.* is default
if (includeSnippets) {
snippet = snippetCache.retrieveSnippet(comp.url(), query.queryHashes, false, 260, 1000);
snippet = snippetCache.retrieveSnippet(comp.url(), query.queryHashes, false, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), 260, 1000);
} else {
snippet = null;
}
@ -2237,10 +2243,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
InputStream docBodyInputStream = document.getText();
// getting word iterator
Iterator witer = plasmaCondenser.getWords(docBodyInputStream);
Iterator witer = null;
try {
witer = plasmaCondenser.getWords(docBodyInputStream, document.charset);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
// delete all word references
int count = removeReferences(urlhash, witer);
int count = 0;
if (witer != null) count = removeReferences(urlhash, witer);
// finally delete the url entry itself
urlPool.loadedURL.remove(urlhash);

@ -420,8 +420,7 @@ public class plasmaURL {
tld = host.substring(p + 1);
dom = host.substring(0, p);
}
Integer ID = (serverCore.isNotLocal(tld)) ? (Integer) TLDID.get(tld)
: null; // identify local addresses
Integer ID = (serverCore.isNotLocal(tld)) ? (Integer) TLDID.get(tld) : null; // identify local addresses
int id = (ID == null) ? 7 : ID.intValue(); // local addresses are flagged with id=7
boolean isHTTP = url.getProtocol().equals("http");
p = dom.lastIndexOf('.'); // locate subdomain

@ -414,19 +414,21 @@ public final class serverCore extends serverAbstractThread implements serverThre
public static boolean isNotLocal(String ip) {
// generate ip address if ip is given by host
assert (ip != null);
// check local ip addresses
if ((ip.equals("localhost")) ||
(ip.startsWith("127")) ||
(ip.startsWith("192.168")) ||
(ip.startsWith("10."))
) return false;
// make a dns resolve
final InetAddress clientAddress = httpc.dnsResolve(ip);
if (clientAddress != null) {
if ((clientAddress.isAnyLocalAddress()) || (clientAddress.isLoopbackAddress())) return false;
if (ip.charAt(0) > '9') ip = clientAddress.getHostAddress();
}
// check local ip addresses
if ((ip.equals("localhost")) ||
(ip.startsWith("127")) ||
(ip.startsWith("192.168")) ||
(ip.startsWith("10."))
) return false;
        // finally check if there are other local IP addresses that are not in the standard IP range
for (int i = 0; i < localAddresses.length; i++) {
if (localAddresses[i].equals(clientAddress)) return false;

@ -188,7 +188,7 @@ parseableMimeTypes.URLREDIRECTOR=
# a comma-separated list of extensions that denote media file formats
# this is important to recognize <a href> - tags as not-html reference
# These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
mediaExt=7z,ace,arj,asf,asx,avi,bin,bz2,css,db,dcm,deb,doc,dll,dmg,gif,gz,hqx,ico,img,iso,jar,jpe,jpg,jpeg,lx,lxl,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,scr,sit,so,swf,sxc,sxd,sxi,sxw,tar,tbz,tgz,torrent,war,wmv,xcf,xls,zip
mediaExt=7z,ace,aif,aiff,arj,asf,asx,avi,bin,bz2,css,db,dcm,deb,doc,dll,dmg,gif,gz,hqx,ico,img,iso,jar,jpe,jpg,jpeg,lx,lxl,m4v,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,scr,sit,so,swf,sxc,sxd,sxi,sxw,tar,tbz,tgz,torrent,war,wav,wmv,xcf,xls,zip
parseableExt=html,htm,txt,php,shtml,asp,aspx,jsp
# Promotion Strings

Loading…
Cancel
Save