From 87a8747ce338a146d295b5af8e15bd91fa50ca47 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 25 Feb 2008 14:08:15 +0000 Subject: [PATCH] - enhanced recognition, parsing, management and double-occurrence-handling of image tags - enhanced text parser (condenser): found and eliminated bad code parts; increase of speed - added handling of image preview using the image cache from HTCACHE - some other minor changes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4507 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 5 +- htroot/ViewFile.java | 6 +-- htroot/ViewImage.java | 46 +++++++++---------- htroot/yacysearchitem.html | 2 +- htroot/yacysearchitem.java | 2 + .../htmlFilter/htmlFilterContentScraper.java | 46 ++++++++++++++++--- source/de/anomic/http/httpdFileHandler.java | 1 + .../anomic/plasma/parser/rss/rssParser.java | 10 ++-- .../anomic/plasma/parser/tar/tarParser.java | 6 +-- .../anomic/plasma/parser/zip/zipParser.java | 6 +-- source/de/anomic/plasma/plasmaCondenser.java | 41 ++++++++++------- .../de/anomic/plasma/plasmaCrawlBalancer.java | 5 +- source/de/anomic/plasma/plasmaCrawlNURL.java | 30 +++++++++--- source/de/anomic/plasma/plasmaParser.java | 5 +- .../anomic/plasma/plasmaParserDocument.java | 32 ++++++------- .../de/anomic/plasma/plasmaSearchImages.java | 26 +++-------- .../de/anomic/plasma/plasmaSnippetCache.java | 3 +- 17 files changed, 161 insertions(+), 111 deletions(-) diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index d825be9b4..bc7898de9 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -54,6 +54,7 @@ import java.io.File; import java.io.FilenameFilter; import java.io.IOException; import java.io.Writer; +import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.TreeSet; @@ -313,8 +314,8 @@ public class CacheAdmin_p { prop.put("info_type_use." + extension, (i == 0) ? 0 : 1); } - private static void formatImageAnchor(serverObjects prop, TreeSet anchor) { - final Iterator iter = anchor.iterator(); + private static void formatImageAnchor(serverObjects prop, HashMap anchor) { + final Iterator iter = anchor.values().iterator(); htmlFilterImageEntry ie; prop.put("info_type_use.images_images", anchor.size()); int i = 0; diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 2ad6fa099..faa66f3ec 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -49,9 +49,9 @@ import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URLDecoder; +import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import java.util.TreeSet; import de.anomic.data.htmlTools; import de.anomic.htmlFilter.htmlFilterImageEntry; @@ -339,8 +339,8 @@ public class ViewFile { i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0)); dark = (i % 2 == 0); - TreeSet ts = document.getImages(); - Iterator tsi = ts.iterator(); + HashMap ts = document.getImages(); + Iterator tsi = ts.values().iterator(); htmlFilterImageEntry entry; while (tsi.hasNext()) { entry = tsi.next(); diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index 785a05200..90878495f 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -135,47 +135,47 @@ public class ViewImage { // find original size int h = image.getHeight(null); int w = image.getWidth(null); - - // System.out.println("DEBUG: get access to image " + - // url.toNormalform() + " is " + ((auth) ? "authorized" : "NOT - // authorized")); - + // in case of not-authorized access shrink the image to prevent - // copyright problems - // so that images are not larger than thumbnails - if ((!auth) && ((w > 16) || (h > 16))) { + // copyright problems, so that images are not larger than thumbnails + if (auth) { + maxwidth = (maxwidth == 0) ? w : maxwidth; + maxheight = (maxheight == 0) ? h : maxheight; + } else if ((w > 16) || (h > 16)) { maxwidth = (int) Math.min(64.0, w * 0.6); maxheight = (int) Math.min(64.0, h * 0.6); + } else { + maxwidth = 16; + maxheight = 16; } // calculate width & height from maxwidth & maxheight - if ((maxwidth != 0) || (maxheight != 0)) { + if ((maxwidth < w) || (maxheight < h)) { + // scale image double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / ((double) w); double vs = (h <= maxheight) ? 1.0 : ((double) maxheight) / ((double) h); double scale = Math.min(hs, vs); if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose if (scale < 1.0) { - width = (int) (w * scale); - height = (int) (h * scale); + width = Math.max(1, (int) (w * scale)); + height = Math.max(1, (int) (h * scale)); } else { - width = w; - height = h; + width = Math.max(1, w); + height = Math.max(1, h); } + + // compute scaled image + scaled = ((w == width) && (h == height)) ? image : image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING); + MediaTracker mediaTracker = new MediaTracker(new Container()); + mediaTracker.addImage(scaled, 0); + try {mediaTracker.waitForID(0);} catch (InterruptedException e) {} } else { + // do not scale width = w; height = h; + scaled = image; } - // check for minimum values - width = Math.max(width, 1); - height = Math.max(height, 1); - - // scale image - scaled = ((w == width) && (h == height)) ? image : image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING); - MediaTracker mediaTracker = new MediaTracker(new Container()); - mediaTracker.addImage(scaled, 0); - try {mediaTracker.waitForID(0);} catch (InterruptedException e) {} - if ((height == 16) && (width == 16) && (resource != null)) { // this might be a favicon, store image to cache for faster re-load later on iconcache.put(urlString, scaled); diff --git a/htroot/yacysearchitem.html b/htroot/yacysearchitem.html index 5f14af734..6291da034 100644 --- a/htroot/yacysearchitem.html +++ b/htroot/yacysearchitem.html @@ -22,7 +22,7 @@ :: #{items}#
- + #[name]# diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 1537327c6..9af955ba5 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -67,6 +67,7 @@ public class yacysearchitem { boolean rss = post.get("rss", "false").equals("true"); boolean authenticated = sb.adminAuthenticated(header) >= 2; int item = post.getInt("item", -1); + boolean auth = ((String) header.get(httpHeader.CONNECTION_PROP_CLIENTIP, "")).equals("localhost") || sb.verifyAuthentication(header, true); // default settings for blank item prop.put("content", "0"); @@ -233,6 +234,7 @@ public class yacysearchitem { if (ms == null) { prop.put("content_items", "0"); } else { + prop.putHTML("content_items_0_hrefCache", (auth) ? "/ViewImage.png?url=" + ms.href.toNormalform(true, false) : ms.href.toNormalform(true, false)); prop.putHTML("content_items_0_href", ms.href.toNormalform(true, false)); prop.put("content_items_0_code", sb.licensedURLs.aquireLicense(ms.href)); prop.putHTML("content_items_0_name", shorten(ms.name, namelength)); diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 82dbf7c4d..18e5b2956 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -54,6 +54,7 @@ import java.net.MalformedURLException; import java.text.Collator; import java.util.ArrayList; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; @@ -102,7 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen // class variables: collectors for links private HashMap anchors; - private TreeSet images; // String(absolute url)/ImageEntry relation + private HashMap images; // urlhash/image relation private HashMap metas; private String title; //private String headline; @@ -127,7 +128,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen super(linkTags0, linkTags1); this.root = root; this.anchors = new HashMap(); - this.images = new TreeSet(); + this.images = new HashMap(); this.metas = new HashMap(); this.title = ""; this.headlines = new ArrayList[4]; @@ -178,7 +179,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } catch (NumberFormatException e) {} yacyURL url = absolutePath(tagopts.getProperty("src", "")); htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt",""), width, height); - images.add(ie); + addImage(images, ie); } if (tagname.equalsIgnoreCase("base")) try { root = new yacyURL(tagopts.getProperty("href", ""), null); @@ -212,7 +213,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (type.equalsIgnoreCase("shortcut icon")) { htmlFilterImageEntry ie = new htmlFilterImageEntry(newLink, linktitle, -1,-1); - images.add(ie); + images.put(ie.url().hash(), ie); this.favicon = newLink; } else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) { anchors.put(newLink, linktitle); @@ -234,12 +235,24 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen // fire event fireScrapeTag0(tagname, tagopts); } - + public void scrapeTag1(String tagname, Properties tagopts, char[] text) { // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text)); if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) { String href = tagopts.getProperty("href", ""); - if (href.length() > 0) anchors.put(absolutePath(href), super.stripAll(new serverCharBuffer(text)).trim().toString()); + if (href.length() > 0) { + yacyURL url = absolutePath(href); + String f = url.getFile(); + int p = f.lastIndexOf('.'); + String type = (p < 0) ? "" : f.substring(p + 1); + if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg")) { + // special handling of such urls: put them to the image urls + htmlFilterImageEntry ie = new htmlFilterImageEntry(url, super.stripAll(new serverCharBuffer(text)).trim().toString(), -1, -1); + addImage(images, ie); + } else { + anchors.put(url, super.stripAll(new serverCharBuffer(text)).trim().toString()); + } + } } String h; if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) { @@ -348,7 +361,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen return anchors; } - public TreeSet getImages() { + public HashMap getImages() { // this resturns a String(absolute url)/htmlFilterImageEntry - relation return images; } @@ -522,5 +535,24 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen return scraper; } + + public static void addAllImages(HashMap a, HashMap b) { + Iterator> i = b.entrySet().iterator(); + Map.Entry ie; + while (i.hasNext()) { + ie = i.next(); + addImage(a, ie.getValue()); + } + } + + public static void addImage(HashMap a, htmlFilterImageEntry ie) { + if (a.containsKey(ie.url().hash())) { + // in case of a collision, take that image that has the better image size tags + if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url().hash(), ie); + } else { + a.put(ie.url().hash(), ie); + } + } + } diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java index ecdc1d14a..e06f1eed1 100644 --- a/source/de/anomic/http/httpdFileHandler.java +++ b/source/de/anomic/http/httpdFileHandler.java @@ -452,6 +452,7 @@ public final class httpdFileHandler { sb.append("\n\n\n\n

Index of " + path + "

\n
    \n"); File dir = new File(htDocsPath, path); String[] list = dir.list(); + if (list == null) list = new String[0]; // should not occur! File f; String size; long sz; diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index c14459135..4d26f8a8e 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -50,7 +50,6 @@ import java.util.HashMap; import java.util.Hashtable; import java.util.LinkedList; import java.util.Map; -import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterAbstractScraper; import de.anomic.htmlFilter.htmlFilterContentScraper; @@ -97,7 +96,7 @@ public class rssParser extends AbstractParser implements Parser { try { LinkedList feedSections = new LinkedList(); HashMap anchors = new HashMap(); - TreeSet images = new TreeSet(); + HashMap images = new HashMap(); serverByteBuffer text = new serverByteBuffer(); serverCharBuffer authors = new serverCharBuffer(); @@ -114,7 +113,8 @@ public class rssParser extends AbstractParser implements Parser { String feedDescription = reader.getChannel().getDescription(); if (reader.getImage() != null) { - images.add(new htmlFilterImageEntry(new yacyURL(reader.getImage(), null), feedTitle, -1, -1)); + yacyURL imgURL = new yacyURL(reader.getImage(), null); + images.put(imgURL.hash(), new htmlFilterImageEntry(imgURL, feedTitle, -1, -1)); } // loop through the feed items @@ -154,9 +154,9 @@ public class rssParser extends AbstractParser implements Parser { anchors.putAll(itemLinks); } - TreeSet itemImages = scraper.getImages(); + HashMap itemImages = scraper.getImages(); if ((itemImages != null) && (itemImages.size() > 0)) { - images.addAll(itemImages); + htmlFilterContentScraper.addAllImages(images, itemImages); } byte[] extractedText = scraper.getText(); diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index 93f7b36cd..0587f0cf5 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -53,12 +53,12 @@ import java.util.HashMap; import java.util.Hashtable; import java.util.LinkedList; import java.util.Map; -import java.util.TreeSet; import java.util.zip.GZIPInputStream; import com.ice.tar.TarEntry; import com.ice.tar.TarInputStream; +import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParserDocument; @@ -132,7 +132,7 @@ public class tarParser extends AbstractParser implements Parser { StringBuffer docAbstrct = new StringBuffer(); Map docAnchors = new HashMap(); - TreeSet docImages = new TreeSet(); + HashMap docImages = new HashMap(); // looping through the contained files TarEntry entry; @@ -193,7 +193,7 @@ public class tarParser extends AbstractParser implements Parser { } docAnchors.putAll(subDoc.getAnchors()); - docImages.addAll(subDoc.getImages()); + htmlFilterContentScraper.addAllImages(docImages, subDoc.getImages()); // release subdocument subDoc.close(); diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index 2b48249ed..e73c615d5 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -53,10 +53,10 @@ import java.util.HashMap; import java.util.Hashtable; import java.util.LinkedList; import java.util.Map; -import java.util.TreeSet; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParserDocument; @@ -115,7 +115,7 @@ public class zipParser extends AbstractParser implements Parser { LinkedList docSections = new LinkedList(); StringBuffer docAbstrct = new StringBuffer(); Map docAnchors = new HashMap(); - TreeSet docImages = new TreeSet(); + HashMap docImages = new HashMap(); // creating a new parser class to parse the unzipped content plasmaParser theParser = new plasmaParser(); @@ -176,7 +176,7 @@ public class zipParser extends AbstractParser implements Parser { } docAnchors.putAll(subDoc.getAnchors()); - docImages.addAll(subDoc.getImages()); + htmlFilterContentScraper.addAllImages(docImages, subDoc.getImages()); // release subdocument subDoc.close(); diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 01d0edb6f..4c2bc8377 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -107,6 +107,19 @@ public final class plasmaCondenser { private final static int numlength = 5; + // initialize array of invisible characters + private static boolean[] invisibleChar = new boolean['z' - ' ' + 1]; + static { + // initialize array of invisible charachters + String invisibleString = "\"$%&/()=`^+*#'-_:;,<>[]\\"; + for (int i = ' '; i <= 'z'; i++) { + invisibleChar[i - ' '] = false; + } + for (int i = 0; i < invisibleString.length(); i++) { + invisibleChar[invisibleString.charAt(i) - ' '] = true; + } + } + //private Properties analysis; private TreeMap words; // a string (the words) to (wordStatProp) - relation private HashMap sentences; @@ -198,7 +211,7 @@ public final class plasmaCondenser { } // images - Iterator j = document.getImages().iterator(); + Iterator j = document.getImages().values().iterator(); htmlFilterImageEntry ientry; while (j.hasNext()) { ientry = j.next(); @@ -659,7 +672,7 @@ public final class plasmaCondenser { public final static boolean invisible(char c) { // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars? if ((c < ' ') || (c > 'z')) return true; - return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0); + return invisibleChar[c - ' ']; } public static Enumeration wordTokenizer(String s, String charset, int minLength) { @@ -727,7 +740,7 @@ public final class plasmaCondenser { public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException { e = new sentencesFromInputStreamEnum(is, charset); - s = new StringBuffer(); + s = new StringBuffer(20); buffer = nextElement0(); } @@ -859,9 +872,9 @@ public final class plasmaCondenser { } static StringBuffer readSentence(Reader reader, boolean pre) throws IOException { - StringBuffer s = new StringBuffer(); + StringBuffer s = new StringBuffer(20); int nextChar; - char c; + char c, lc = (char) 0; // find sentence end for (;;) { @@ -871,20 +884,14 @@ public final class plasmaCondenser { if (s.length() == 0) return null; else break; } c = (char) nextChar; + if (pre && ((c == (char) 10) || (c == (char) 13))) break; + if ((c == (char) 8) || (c == (char) 10) || (c == (char) 13)) c = ' '; + if ((lc == ' ') && (c == ' ')) continue; // ignore double spaces s.append(c); - if (pre) { - if ((c == (char) 10) || (c == (char) 13)) break; - } else { - if (htmlFilterContentScraper.punctuation(c)) break; - } - } - - // replace line endings and tabs by blanks - for (int i = 0; i < s.length(); i++) { - if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 8)) s.setCharAt(i, ' '); + if (htmlFilterContentScraper.punctuation(c)) break; + lc = c; } - // remove all double-spaces - int p; while ((p = s.indexOf(" ")) >= 0) s.deleteCharAt(p); + return s; } diff --git a/source/de/anomic/plasma/plasmaCrawlBalancer.java b/source/de/anomic/plasma/plasmaCrawlBalancer.java index 960765c53..75e32bcba 100644 --- a/source/de/anomic/plasma/plasmaCrawlBalancer.java +++ b/source/de/anomic/plasma/plasmaCrawlBalancer.java @@ -130,7 +130,10 @@ public class plasmaCrawlBalancer { } public void finalize() { - if (urlFileStack != null) close(); + if (urlFileStack != null) { + serverLog.logWarning("plasmaCrawlBalancer", "crawl stack " + stackname + " closed by finalizer"); + close(); + } } public synchronized void clear() { diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index faa53c931..244a8d1f9 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -49,6 +49,8 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; +import de.anomic.server.logging.serverLog; + public class plasmaCrawlNURL { public static final int STACK_TYPE_NULL = 0; // do not stack @@ -64,9 +66,9 @@ public class plasmaCrawlNURL { private static final long minimumGlobalDelta = 500; // the minimum time difference between access of the same global domain private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt - private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1 - private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth - private final plasmaCrawlBalancer remoteStack; // links from remote crawl orders + private plasmaCrawlBalancer coreStack; // links found by crawling to depth-1 + private plasmaCrawlBalancer limitStack; // links found by crawling at target depth + private plasmaCrawlBalancer remoteStack; // links from remote crawl orders //private final plasmaCrawlBalancer overhangStack; // links found by crawling at depth+1 //private kelondroStack imageStack; // links pointing to image resources //private kelondroStack movieStack; // links pointing to movie resources @@ -81,10 +83,26 @@ public class plasmaCrawlNURL { } public void close() { - coreStack.close(); - limitStack.close(); + if (coreStack != null) { + coreStack.close(); + coreStack = null; + } + if (limitStack != null) { + limitStack.close(); + limitStack = null; + } //overhangStack.close(); - remoteStack.close(); + if (remoteStack != null) { + remoteStack.close(); + remoteStack = null; + } + } + + public void finalize() { + if ((coreStack != null) || (limitStack != null) || (remoteStack != null)) { + serverLog.logWarning("plasmaCrawlNURL", "NURL stack closed by finalizer"); + close(); + } } public boolean notEmpty() { diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index a7001118e..0344ae68b 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -39,6 +39,7 @@ import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URI; import java.util.Arrays; +import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Hashtable; @@ -747,7 +748,7 @@ public final class plasmaParser { } - static Map allReflinks(Set links) { + static Map allReflinks(Collection links) { // links is either a Set of Strings (with urls) or htmlFilterImageEntries // we find all links that are part of a reference inside a url HashMap v = new HashMap(); @@ -786,7 +787,7 @@ public final class plasmaParser { return v; } - static Map allSubpaths(Set links) { + static Map allSubpaths(Collection links) { // links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries HashSet h = new HashSet(); Iterator i = links.iterator(); diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 014eedefc..e9c87581b 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -61,6 +61,7 @@ import java.util.List; import java.util.Map; import java.util.TreeSet; +import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.plasma.parser.Parser; @@ -76,7 +77,7 @@ public class plasmaParserDocument { private StringBuffer description; // an abstract, if present: short content description private Object text; // the clear text, all that is visible private Map anchors; // all links embedded as clickeable entities (anchor tags) - private TreeSet images; // all visible pictures in document + private HashMap images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. @@ -89,7 +90,7 @@ public class plasmaParserDocument { protected plasmaParserDocument(yacyURL location, String mimeType, String charset, String[] keywords, String title, String author, String[] sections, String abstrct, - Object text, Map anchors, TreeSet images) { + Object text, Map anchors, HashMap images) { this.source = location; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; this.charset = charset; @@ -99,7 +100,7 @@ public class plasmaParserDocument { this.sections = (sections == null) ? new LinkedList() : Arrays.asList(sections); this.description = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct); this.anchors = (anchors == null) ? new HashMap(0) : anchors; - this.images = (images == null) ? new TreeSet() : images; + this.images = (images == null) ? new HashMap() : images; this.hyperlinks = null; this.audiolinks = null; this.videolinks = null; @@ -124,21 +125,21 @@ public class plasmaParserDocument { public plasmaParserDocument(yacyURL location, String mimeType, String charset, String[] keywords, String title, String author, String[] sections, String abstrct, - byte[] text, Map anchors, TreeSet images) { + byte[] text, Map anchors, HashMap images) { this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); } public plasmaParserDocument(yacyURL location, String mimeType, String charset, String[] keywords, String title, String author, String[] sections, String abstrct, - File text, Map anchors, TreeSet images) { + File text, Map anchors, HashMap images) { this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); } public plasmaParserDocument(yacyURL location, String mimeType, String charset, String[] keywords, String title, String author, String[] sections, String abstrct, - serverCachedFileOutputStream text, Map anchors, TreeSet images) { + serverCachedFileOutputStream text, Map anchors, HashMap images) { this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); } @@ -310,7 +311,7 @@ dc_rights return this.videolinks; } - public TreeSet getImages() { + public HashMap getImages() { // returns all links enbedded as pictures (visible in document) // this resturns a htmlFilterImageEntry collection if (!resorted) resortLinks(); @@ -341,7 +342,7 @@ dc_rights audiolinks = new HashMap(); applinks = new HashMap(); emaillinks = new HashMap(); - TreeSet collectedImages = new TreeSet(); // this is a set that is collected now and joined later to the imagelinks + HashMap collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks Map.Entry entry; while (i.hasNext()) { entry = i.next(); @@ -361,7 +362,7 @@ dc_rights if (plasmaParser.mediaExtContains(ext)) { // this is not a normal anchor, its a media link if (plasmaParser.imageExtContains(ext)) { - collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1)); + htmlFilterContentScraper.addImage(collectedImages, new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1)); } else if (plasmaParser.audioExtContains(ext)) audiolinks.put(url, (String)entry.getValue()); else if (plasmaParser.videoExtContains(ext)) videolinks.put(url, (String)entry.getValue()); @@ -374,23 +375,18 @@ dc_rights } // add image links that we collected from the anchors to the image map - Iterator j = collectedImages.iterator(); - htmlFilterImageEntry iEntry; - while (j.hasNext()) { - iEntry = (htmlFilterImageEntry) j.next(); - if (!images.contains(iEntry)) images.add(iEntry); - } + htmlFilterContentScraper.addAllImages(images, collectedImages); // expand the hyperlinks: // we add artificial hyperlinks to the hyperlink set // that can be calculated from given hyperlinks and imagelinks - hyperlinks.putAll(plasmaParser.allReflinks(images)); + hyperlinks.putAll(plasmaParser.allReflinks(images.values())); hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet())); hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet())); hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet())); hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet())); - hyperlinks.putAll(plasmaParser.allSubpaths(images)); + hyperlinks.putAll(plasmaParser.allSubpaths(images.values())); hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet())); hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet())); hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet())); @@ -417,7 +413,7 @@ dc_rights serverFileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text); anchors.putAll(doc.getAnchors()); - images.addAll(doc.getImages()); + htmlFilterContentScraper.addAllImages(images, doc.getImages()); } /** diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java index 90ae0ec01..aabaf4cc8 100644 --- a/source/de/anomic/plasma/plasmaSearchImages.java +++ b/source/de/anomic/plasma/plasmaSearchImages.java @@ -43,9 +43,10 @@ package de.anomic.plasma; import java.io.InputStream; import java.net.MalformedURLException; +import java.util.HashMap; import java.util.Iterator; -import java.util.TreeSet; +import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.plasma.parser.ParserException; import de.anomic.server.serverDate; @@ -53,11 +54,11 @@ import de.anomic.yacy.yacyURL; public final class plasmaSearchImages { - private TreeSet images; + private HashMap images; public plasmaSearchImages(long maxTime, yacyURL url, int depth) { long start = System.currentTimeMillis(); - this.images = new TreeSet(); + this.images = new HashMap(); if (maxTime > 10) { Object[] resource = plasmaSnippetCache.getResource(url, true, (int) maxTime, false); InputStream res = (InputStream) resource[0]; @@ -75,7 +76,7 @@ public final class plasmaSearchImages { if (document == null) return; // add the image links - this.addAll(document.getImages()); + htmlFilterContentScraper.addAllImages(this.images, document.getImages()); // add also links from pages one step deeper, if depth > 0 if (depth > 0) { @@ -97,26 +98,13 @@ public final class plasmaSearchImages { public void addAll(plasmaSearchImages m) { synchronized (m.images) { - addAll(m.images); - } - } - - private void addAll(TreeSet ts) { - Iterator i = ts.iterator(); - htmlFilterImageEntry ie; - while (i.hasNext()) { - ie = i.next(); - if (images.contains(ie)) { - if ((ie.height() > 0) && (ie.width() > 0)) images.add(ie); - } else { - images.add(ie); - } + htmlFilterContentScraper.addAllImages(this.images, m.images); } } public Iterator entries() { // returns htmlFilterImageEntry - Objects - return images.iterator(); + return images.values().iterator(); } } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 42accd875..3b336ee69 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -697,7 +697,8 @@ public class plasmaSnippetCache { public static ArrayList computeImageSnippets(plasmaParserDocument document, Set queryhashes) { - TreeSet images = document.getImages(); // iterates images in descending size order! + TreeSet images = new TreeSet(); + images.addAll(document.getImages().values()); // iterates images in descending size order! // a measurement for the size of the images can be retrieved using the htmlFilterImageEntry.hashCode() Iterator i = images.iterator();