redesigned some parts of the html scanner & parser

to better support image tags

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1995 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent ac114d69c0
commit 83e0e765ec

@ -66,9 +66,10 @@ public class ViewImage {
}
int width = post.getInt("width", 0);
int height = post.getInt("height", 0);
int timeout = post.getInt("timeout", 5000);
// load image
byte[] imgb = sb.snippetCache.getResource(url, true);
byte[] imgb = sb.snippetCache.getResource(url, true, timeout);
if (imgb == null) return null;
// create image
@ -76,7 +77,20 @@ public class ViewImage {
Image original = Toolkit.getDefaultToolkit().createImage(imgb);
mediaTracker.addImage(original, 0);
try {mediaTracker.waitForID(0);} catch (InterruptedException e) {}
if ((width == 0) || (height == 0)) return original;
boolean auth = ((String) header.get("CLIENTIP", "")).equals("localhost") || sb.verifyAuthentication(header, false); // handle access rights
if ((auth) && ((width == 0) || (height == 0))) return original;
// in case of not-authorized access shrink the image to prevent copyright problems
// so that images are not larger than thumbnails
if (!auth) {
width = width / 2;
height = height / 2;
int xsc = Math.max(width, height);
if (xsc > 64) {
width = width * 64 / xsc;
height = height * 64 / xsc;
}
}
// scale image
Image scaled = original.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);

@ -89,7 +89,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// class variables: collectors for links
private HashMap anchors;
private HashMap images;
private TreeSet images; // String(absolute url)/ImageEntry relation
private HashMap metas;
private String title;
//private String headline;
@ -103,7 +103,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
super(linkTags0, linkTags1);
this.root = root;
this.anchors = new HashMap();
this.images = new HashMap();
this.images = new TreeSet();
this.metas = new HashMap();
this.title = "";
this.headlines = new ArrayList[4];
@ -112,55 +112,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
// Appends the tag-stripped visible text of a scraped segment to the page
// content buffer, separating segments with a single space (byte 32).
// The +1 in the serverByteBuffer size presumably leaves room for the
// trailing space appended below -- TODO confirm against serverByteBuffer.
public void scrapeText(byte[] newtext) {
// debug: System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);
content.append(super.stripAll(new serverByteBuffer(newtext, newtext.length + 1)).trim()).append(32);
}
/*
public static String urlNormalform(String us) {
if (us == null) { return null; }
if (us.length() == 0) { return null; }
serverLog.logFiner("htmlFilter", "urlNormalform: IN=" + us);
// TODO: what about
// - case insensitive domain names
// - chars that should be escaped in URLs
// cutting of everything behind #
int cpos = us.indexOf("#");
if (cpos >= 0) { us = us.substring(0, cpos); }
if (us.startsWith("https")) {
if (us.endsWith(":443")) {
us = us.substring(0, us.length() - 4);
serverLog.logFinest("htmlFilter", "urlNormalform: :443=" + us);
} else {
cpos = us.indexOf(":443/");
if (cpos >= 0) {
us = us.substring(0, cpos).concat(us.substring(cpos + 4));
serverLog.logFinest("htmlFilter", "urlNormalform: :443/=" + us);
}
}
} else if (us.startsWith("http")) {
if (us.endsWith(":80")) {
us = us.substring(0, us.length() - 3);
serverLog.logFinest("htmlFilter", "urlNormalform: :80=" + us);
} else {
cpos = us.indexOf(":80/");
if (cpos >= 0) {
us = us.substring(0, cpos).concat(us.substring(cpos + 3));
serverLog.logFinest("htmlFilter", "urlNormalform: :80/=" + us);
}
}
}
if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
serverLog.logFine("htmlFilter", "urlNormalform: OUT=" + us);
return us;
}
*/
public static String urlNormalform(URL url) {
boolean defaultPort = false;
// serverLog.logFinest("htmlFilter", "urlNormalform: '" + url.toString() + "'");
@ -212,7 +168,18 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
public void scrapeTag0(String tagname, Properties tagopts) {
if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
if (tagname.equalsIgnoreCase("img")) {
int width = -1, height = -1;
try {
width = Integer.parseInt(tagopts.getProperty("width", "-1"));
height = Integer.parseInt(tagopts.getProperty("height", "-1"));
} catch (NumberFormatException e) {}
try {
URL url = new URL(absolutePath(tagopts.getProperty("src", "")));
htmlFilterImageEntry ie = new htmlFilterImageEntry(url, tagopts.getProperty("alt",""), width, height);
images.add(ie);
} catch (MalformedURLException e) {}
}
if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {}
if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
if (tagname.equalsIgnoreCase("meta")) {
@ -230,7 +197,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
if ((tagname.equalsIgnoreCase("a")) && (text.length < 2048)) anchors.put(absolutePath(tagopts.getProperty("href", "")), super.stripAll(new serverByteBuffer(text)).trim().toString());
String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
@ -303,7 +270,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return anchors;
}
public Map getImages() {
public TreeSet getImages() {
// this returns a String(absolute url)/htmlFilterImageEntry - relation
return images;
}
@ -389,7 +357,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
System.out.println("METAS :" + metas.toString());
System.out.println("TEXT :" + new String(content.getBytes()));
}
/*
public static void main(String[] args) {
try {

@ -0,0 +1,101 @@
// htmlFilterImageEntry.java
// -----------------------------
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
// created 04.04.2006
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.htmlFilter;
import java.net.URL;
public class htmlFilterImageEntry {
private URL url;
private String alt;
private int width, height;
public htmlFilterImageEntry(URL url, String alt, int width, int height) {
this.url = url;
this.alt = alt;
this.width = width;
this.height = height;
}
public URL url() {
return this.url;
}
public String alt() {
return this.alt;
}
public int width() {
return this.width;
}
public int height() {
return this.height;
}
public String toString() {
return "{" + alt + ", " + width + "/" + height + "}";
}
public int hashCode() {
if ((width > 0) && (height > 0))
return ((0xFFFF - (((width * height) >> 8) & 0xFFFF)) << 16) | (url.hashCode() & 0xFFFF);
else
return 0xFFFF0000 | (url.hashCode() & 0xFFFF);
}
public int compareTo(Object h) {
// this is needed if this object is stored in a TreeSet
assert (url != null);
assert (h instanceof htmlFilterImageEntry);
if (this.url.equals(((htmlFilterImageEntry) h).url)) return 0;
int thc = this.hashCode();
int ohc = ((htmlFilterImageEntry) h).hashCode();
if (thc < ohc) return -1;
if (thc > ohc) return 1;
return 0;
}
public boolean equals(Object o) {
if (!(o instanceof htmlFilterImageEntry)) return false;
return compareTo(o) == 0;
}
}

@ -53,9 +53,11 @@ import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterAbstractScraper;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
@ -104,7 +106,7 @@ public class rssParser extends AbstractParser implements Parser {
try {
LinkedList feedSections = new LinkedList();
HashMap anchors = new HashMap();
HashMap images = new HashMap();
TreeSet images = new TreeSet();
serverByteBuffer text = new serverByteBuffer();
@ -125,7 +127,7 @@ public class rssParser extends AbstractParser implements Parser {
ImageIF channelImage = channel.getImage();
if (channelImage != null) {
images.put(channelImage.getLocation().toString(),channelImage.getTitle());
images.add(new htmlFilterImageEntry(channelImage.getLocation(), channelImage.getTitle(), -1, -1));
}
// loop through the feed items
@ -162,9 +164,9 @@ public class rssParser extends AbstractParser implements Parser {
anchors.putAll(itemLinks);
}
Map itemImages = scraper.getImages();
TreeSet itemImages = scraper.getImages();
if ((itemImages != null) && (itemImages.size() > 0)) {
images.putAll(itemImages);
images.addAll(itemImages);
}
byte[] extractedText = scraper.getText();

@ -51,6 +51,7 @@ import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeSet;
import java.util.zip.GZIPInputStream;
import com.ice.tar.TarEntry;
@ -115,7 +116,7 @@ public class tarParser extends AbstractParser implements Parser {
StringBuffer docAbstrct = new StringBuffer();
serverByteBuffer docText = new serverByteBuffer();
Map docAnchors = new HashMap();
Map docImages = new HashMap();
TreeSet docImages = new TreeSet();
// looping through the contained files
TarEntry entry;
@ -174,7 +175,7 @@ public class tarParser extends AbstractParser implements Parser {
docText.append(theDoc.getText());
docAnchors.putAll(theDoc.getAnchors());
docImages.putAll(theDoc.getImages());
docImages.addAll(theDoc.getImages());
}
/* (URL location, String mimeType,

@ -51,6 +51,7 @@ import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
@ -100,8 +101,7 @@ public class zipParser extends AbstractParser implements Parser {
StringBuffer docAbstrct = new StringBuffer();
serverByteBuffer docText = new serverByteBuffer();
Map docAnchors = new HashMap();
Map docImages = new HashMap();
TreeSet docImages = new TreeSet();
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
@ -151,7 +151,7 @@ public class zipParser extends AbstractParser implements Parser {
docText.append(theDoc.getText());
docAnchors.putAll(theDoc.getAnchors());
docImages.putAll(theDoc.getImages());
docImages.addAll(theDoc.getImages());
}
/* (URL location, String mimeType,

@ -418,10 +418,11 @@ public final class plasmaCrawlLURL extends plasmaURL {
// - phrasecount, total number of phrases
// - boolean: URL attributes (see Word-Entity definition)
// - boolean: appearance of bold and/or italics
// - ETag: for re-crawl decision upon HEAD request
// - int: # of outlinks to same domain
// - int: # of outlinks to outside domain
// - ETag: for re-crawl decision upon HEAD request
// - int: # of keywords
// - int: # der auf der Seite vorhandenen Links zu image, audio, video, applications
public Entry(URL url, String descr, Date moddate, Date loaddate, String referrerHash, int copyCount, boolean localNeed, int quality, String language, char doctype, int size, int wordCount) {
// create new entry and store it into database

@ -247,6 +247,7 @@ public final class plasmaCrawlWorker extends Thread {
}
}
public static plasmaHTCache.Entry load(
URL url,
String name,

@ -132,7 +132,12 @@ public final class plasmaParser {
/**
* A list of media extensions that should <b>not</b> be handled by the plasmaParser
*/
private static final HashSet mediaExtSet = new HashSet(28);
private static final HashSet mediaExtSet = new HashSet();
/**
* A list of image extensions that should be handleable by image viewer apps
*/
private static final HashSet imageExtSet = new HashSet();
/**
* This {@link FilenameFilter} is used to find all classes based on there filenames
@ -160,8 +165,17 @@ public final class plasmaParser {
* @see #initMediaExt(String)
*/
static {
initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
"sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj"));
initMediaExt(extString2extList(
        // NOTE: each segment must end with a comma; the previous version
        // wrote "...bat,sh" + "tar,..." which merged into the bogus
        // extension "shtar" and dropped both "sh" and "tar" from the list
        "sit,hqx,img,dmg,exe,com,bat,sh," +  // application containers
        "tar,gz,bz2,arj,zip,rar," +          // archive formats
        "ps,xls,ppt,asf," +                  // text formats without support
        "mp3,ogg,aac," +                     // audio formats
        "swf,avi,wmv,rm,mov,mpg,mpeg,ram," + // video formats
        "jpg,jpeg,jpe,gif,png"               // image formats
        ));
initImageExt(extString2extList(
        "jpg,jpeg,jpe,gif,png"               // image formats
        ));
/* ===================================================
* initializing the parser object pool
@ -225,8 +239,6 @@ public final class plasmaParser {
}
}
public static List extString2extList(String extString) {
LinkedList extensions = new LinkedList();
if ((extString == null) || (extString.length() == 0)) {
@ -245,6 +257,13 @@ public final class plasmaParser {
}
}
// Replaces the global set of image-file extensions with the given list.
// Synchronized on imageExtSet so concurrent readers (see imageExtContains)
// never observe a half-initialized set.
public static void initImageExt(List imageExtList) {
synchronized (imageExtSet) {
imageExtSet.clear();
imageExtSet.addAll(imageExtList);
}
}
public static String getMediaExtList() {
synchronized (mediaExtSet) {
return mediaExtSet.toString();
@ -315,6 +334,13 @@ public final class plasmaParser {
}
}
// Tells whether the given file extension is one of the configured image
// extensions. The lookup ignores surrounding whitespace and letter case;
// a null argument never matches. The set lookup is synchronized against
// concurrent re-initialization (see initImageExt).
public static boolean imageExtContains(String imageExt) {
    if (imageExt == null) return false;
    // normalize outside the lock; only the lookup needs synchronization
    String key = imageExt.trim().toLowerCase();
    synchronized (imageExtSet) {
        return imageExtSet.contains(key);
    }
}
public static String getRealMimeType(String mimeType) {
//if (mimeType == null) doMimeTypeAnalysis
if (mimeType == null) mimeType = "application/octet-stream";

@ -43,12 +43,15 @@
package de.anomic.plasma;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import java.io.ByteArrayInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
public class plasmaParserDocument {
@ -61,7 +64,7 @@ public class plasmaParserDocument {
String abstrct; // an abstract, if present: short content description
byte[] text; // the clear text, all that is visible
Map anchors; // all links embedded as clickeable entities (anchor tags)
Map images; // all visible pictures in document
TreeSet images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
@ -69,11 +72,12 @@ public class plasmaParserDocument {
Map medialinks;
Map emaillinks;
plasmaCondenser condenser;
boolean resorted;
public plasmaParserDocument(URL location, String mimeType,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, Map images) {
byte[] text, Map anchors, TreeSet images) {
this.location = location;
this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
this.keywords = (keywords==null)?"":keywords;
@ -83,23 +87,14 @@ public class plasmaParserDocument {
this.abstrct = (abstrct==null)?"":abstrct;
this.text = (text==null)?new byte[0]:text;
this.anchors = (anchors==null)?new HashMap(0):anchors;
this.images = (images==null)?new HashMap(0):images;
this.images = (images==null)?new TreeSet():images;
this.hyperlinks = null;
this.medialinks = null;
this.emaillinks = null;
this.condenser = null;
this.resorted = false;
}
/*
private String absolutePath(String relativePath) {
try {
return htmlFilterContentScraper.urlNormalform(location, relativePath);
} catch (Exception e) {
return "";
}
}
*/
public String getMimeType() {
return this.mimeType;
}
@ -143,8 +138,10 @@ public class plasmaParserDocument {
return anchors;
}
public Map getImages() {
public TreeSet getImages() {
// returns all links embedded as pictures (visible in document)
// this returns a htmlFilterImageEntry collection
if (!resorted) resortLinks();
return images;
}
@ -152,23 +149,25 @@ public class plasmaParserDocument {
public Map getHyperlinks() {
// this is a subset of the getAnchor-set: only links to other hyperrefs
if (hyperlinks == null) resortLinks();
if (!resorted) resortLinks();
return hyperlinks;
}
public Map getMedialinks() {
// this is partly subset of getAnchor and getImage: all non-hyperrefs
if (medialinks == null) resortLinks();
if (!resorted) resortLinks();
return medialinks;
}
public Map getEmaillinks() {
// this is part of the getAnchor-set: only links to email addresses
if (emaillinks == null) resortLinks();
if (!resorted) resortLinks();
return emaillinks;
}
private synchronized void resortLinks() {
// extract hyperlinks, medialinks and emaillinks from anchorlinks
Iterator i;
String url;
int extpos, qpos;
@ -177,6 +176,7 @@ public class plasmaParserDocument {
hyperlinks = new HashMap();
medialinks = new HashMap();
emaillinks = new HashMap();
TreeSet collectedImages = new TreeSet(); // this is a set that is collected now and joined later to the imagelinks
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
@ -190,42 +190,54 @@ public class plasmaParserDocument {
if (((qpos = url.indexOf("?")) >= 0) && (qpos > extpos)) {
ext = url.substring(extpos, qpos).toLowerCase();
} else {
ext = url.substring(extpos).toLowerCase();
ext = url.substring(extpos).toLowerCase();
}
normal = htmlFilterContentScraper.urlNormalform(null, url);
if (normal != null) { //TODO: extension function is not correct
if (plasmaParser.mediaExtContains(ext.substring(1))) {
// this is not an normal anchor, its a media link
// this is not a normal anchor, its a media link
medialinks.put(normal, entry.getValue());
} else {
hyperlinks.put(normal, entry.getValue());
}
if (plasmaParser.imageExtContains(ext.substring(1))) {
try {
collectedImages.add(new htmlFilterImageEntry(new URL(normal), "", -1, -1));
} catch (MalformedURLException e) {}
}
}
}
}
}
// finally add the images to the medialinks
i = images.entrySet().iterator();
// add the images to the medialinks
i = images.iterator();
String normal;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
normal = htmlFilterContentScraper.urlNormalform(null, url);
if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
if (normal != null) medialinks.put(normal, ((htmlFilterImageEntry) entry.getValue()).alt()); // avoid NullPointerException
}
expandHyperlinks();
}
public synchronized void expandHyperlinks() {
// we add artificial hyperlinks to the hyperlink set that can be calculated from
// given hyperlinks and imagelinks
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks));
hyperlinks.putAll(plasmaParser.allReflinks(medialinks));
hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks));
hyperlinks.putAll(plasmaParser.allSubpaths(medialinks));
// finally add image links that we collected from the anchors to the image map
i = collectedImages.iterator();
htmlFilterImageEntry iEntry;
while (i.hasNext()) {
iEntry = (htmlFilterImageEntry) i.next();
if (!images.contains(iEntry)) images.add(iEntry);
}
// don't do this again
this.resorted = true;
}
}

@ -0,0 +1,119 @@
// plasmaSearchImages.java
// -----------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
// Created: 04.04.2006
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.server.serverDate;
/**
 * Collects image references for search results: loads each URL (from the
 * cache or, if permitted, online), parses the document, gathers its
 * htmlFilterImageEntry objects and optionally follows hyperlinks up to a
 * given depth, all within a millisecond time budget.
 */
public final class plasmaSearchImages {

    // sorted set of htmlFilterImageEntry objects collected so far
    private TreeSet images;

    /**
     * Collects images from one document and, if depth > 0, from the pages
     * it links to. maxTime is the budget in milliseconds; budgets of 10 ms
     * or less abort immediately.
     */
    public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, URL url, int depth) {
        long start = System.currentTimeMillis();
        this.images = new TreeSet();
        if (maxTime > 10) {
            byte[] res = sc.getResource(url, true, (int) maxTime);
            if (res != null) {
                plasmaParserDocument document = sc.parseDocument(url, res);

                // add the image links of this document
                this.addAll(document.getImages());

                // descend into linked pages, passing on the remaining budget
                if (depth > 0) {
                    Map hl = document.getHyperlinks();
                    Iterator i = hl.entrySet().iterator();
                    while (i.hasNext()) {
                        Map.Entry e = (Map.Entry) i.next();
                        String nexturlstring = htmlFilterContentScraper.urlNormalform(null, (String) e.getKey());
                        try {
                            addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), new URL(nexturlstring), depth - 1));
                        } catch (MalformedURLException e2) {}
                    }
                }
            }
        }
    }

    /**
     * Collects images from every URL of a search result set, sharing one
     * time budget across all results.
     */
    public plasmaSearchImages(plasmaSnippetCache sc, long maxTime, plasmaSearchResult sres, int depth) {
        long start = System.currentTimeMillis();
        this.images = new TreeSet();
        plasmaCrawlLURL.Entry urlentry;
        while (sres.hasMoreElements()) {
            urlentry = sres.nextElement();
            addAll(new plasmaSearchImages(sc, serverDate.remainingTime(start, maxTime, 10), urlentry.url(), depth));
        }
    }

    /** Merges all images collected by another plasmaSearchImages instance. */
    public void addAll(plasmaSearchImages m) {
        synchronized (m.images) {
            addAll(m.images);
        }
    }

    private void addAll(TreeSet ts) {
        // merge htmlFilterImageEntry objects into this.images
        Iterator i = ts.iterator();
        htmlFilterImageEntry ie;
        while (i.hasNext()) {
            ie = (htmlFilterImageEntry) i.next();
            if (images.contains(ie)) {
                // an entry for this image already exists; prefer the one that
                // carries explicit dimensions. TreeSet.add() is a no-op for an
                // element that compares equal to an existing one, so the stale
                // entry must be removed first (the previous code called add()
                // alone, which never actually replaced anything).
                if ((ie.height() > 0) && (ie.width() > 0)) {
                    images.remove(ie);
                    images.add(ie);
                }
            } else {
                images.add(ie);
            }
        }
    }

    /** @return an iterator over the collected htmlFilterImageEntry objects */
    public Iterator entries() {
        return images.iterator();
    }
}

@ -1,184 +0,0 @@
// plasmaSearchMedia.java
// -----------------------
// part of YACY
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2006
// Created: 03.04.2006
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
/**
 * Collects media links (files with one of a given set of extensions) for
 * search results: loads each result URL, parses the document, keeps every
 * media link whose file extension is in the requested set, and optionally
 * follows hyperlinks up to a given depth.
 */
public final class plasmaSearchMedia {

    private HashSet ext;   // accepted file extensions, lower-case, no dot
    private TreeSet media; // collected Entry objects

    public plasmaSearchMedia(plasmaSnippetCache sc, String exts, URL url, int depth) {
        this(sc, extGen(exts), url, depth);
    }

    public plasmaSearchMedia(plasmaSnippetCache sc, HashSet exts, URL url, int depth) {
        this.ext = exts;
        this.media = new TreeSet();
        byte[] res = sc.getResource(url, true);
        if (res != null) {
            plasmaParserDocument document = sc.parseDocument(url, res);

            // add the media links whose extension is in the accepted set
            Map ml = document.getMedialinks();
            Iterator i = ml.entrySet().iterator();
            while (i.hasNext()) {
                Map.Entry e = (Map.Entry) i.next();
                String nexturlstring = htmlFilterContentScraper.urlNormalform(null, (String) e.getKey());
                int p = nexturlstring.lastIndexOf(".");
                if ((p > 0) && (this.ext.contains(nexturlstring.substring(p + 1)))) {
                    try {
                        media.add(new Entry(new URL(nexturlstring), 0));
                    } catch (MalformedURLException e1) {}
                }
            }

            // add also links from pages one step deeper, if depth > 0
            if (depth > 0) {
                Map hl = document.getHyperlinks();
                i = hl.entrySet().iterator();
                while (i.hasNext()) {
                    Map.Entry e = (Map.Entry) i.next();
                    String nexturlstring = htmlFilterContentScraper.urlNormalform(null, (String) e.getKey());
                    try {
                        addAll(new plasmaSearchMedia(sc, ext, new URL(nexturlstring), depth - 1));
                    } catch (MalformedURLException e2) {}
                }
            }
        }
    }

    public plasmaSearchMedia(plasmaSnippetCache sc, String exts, plasmaSearchResult sres) {
        this(sc, extGen(exts), sres);
    }

    public plasmaSearchMedia(plasmaSnippetCache sc, HashSet exts, plasmaSearchResult sres) {
        this.ext = exts;
        this.media = new TreeSet();
        plasmaCrawlLURL.Entry urlentry;
        while (sres.hasMoreElements()) {
            urlentry = sres.nextElement();
            addAll(new plasmaSearchMedia(sc, ext, urlentry.url(), 0));
        }
    }

    /** Splits a comma- and/or space-separated extension list into a set. */
    private static HashSet extGen(String ext) {
        // String.replaceAll returns a NEW string; the result must be assigned
        // (the previous code discarded it, so commas were never turned into
        // spaces and comma-separated lists produced useless set entries)
        String[] exts = ext.replaceAll(",", " ").split(" ");
        HashSet s = new HashSet(exts.length);
        for (int i = 0; i < exts.length; i++) s.add(exts[i]);
        return s;
    }

    public void addAll(plasmaSearchMedia m) {
        this.media.addAll(m.media);
    }

    /** @return an iterator over the collected Entry objects */
    public Iterator entries() {
        return media.iterator();
    }

    /**
     * One collected media link with optional size or pixel dimensions.
     * Stored in a TreeSet, so it must implement Comparable (a TreeSet casts
     * its elements to Comparable on insertion).
     */
    public class Entry implements Comparable {
        private URL url;
        private int size, width, height;

        public Entry(URL url, int size) {
            this.url = url;
            this.size = size;
            this.width = -1;
            this.height = -1;
        }

        public Entry(URL url, int width, int height) {
            this.url = url;
            this.size = -1;
            this.width = width;
            this.height = height;
        }

        public URL url() {
            return this.url;
        }

        public int size() {
            return this.size;
        }

        public int width() {
            return this.width;
        }

        public int height() {
            return this.height;
        }

        public int hashCode() {
            // ordering key: pixel area (or byte size) in the high bits,
            // URL hash in the low 16 bits
            if ((width > 0) && (height > 0))
                return (((width * height) >> 8) << 16) | (url.hashCode() & 0xFFFF);
            else
                return ((size >> 8) << 16) | (url.hashCode() & 0xFFFF);
        }

        public int compareTo(Object h) {
            // this is needed if this object is stored in a TreeSet
            assert (url != null);
            assert (h instanceof plasmaSearchMedia.Entry);
            int thc = this.hashCode();
            int ohc = ((plasmaSearchMedia.Entry) h).hashCode();
            if (thc < ohc) return -1;
            if (thc > ohc) return 1;
            return 0;
        }
    }
}

@ -398,13 +398,12 @@ public class plasmaSnippetCache {
}
}
public byte[] getResource(URL url, boolean fetchOnline) {
public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) {
// load the url as resource from the web
try {
//return httpc.singleGET(url, 5000, null, null, remoteProxyHost, remoteProxyPort);
byte[] resource = cacheManager.loadResource(url);
if ((fetchOnline) && (resource == null)) {
loadResourceFromWeb(url, 5000);
loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout);
resource = cacheManager.loadResource(url);
}
return resource;

@ -1958,7 +1958,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url,
// fetchOnline)));
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself

@ -299,7 +299,13 @@ public final class serverDate {
return "unknown";
}
}
// Computes how many milliseconds remain of a time budget of <due> ms that
// began at <start> (epoch ms). Returns -1 when due < 0 (negative budgets
// appear to mean "unlimited" -- TODO confirm against callers). Once the
// budget is exhausted (r <= 0) it returns <minimum> instead of zero or a
// negative value, granting follow-up operations a small time floor.
// NOTE(review): remaining values with 0 < r < minimum are returned as-is,
// i.e. the floor only applies AFTER expiry -- confirm this asymmetry is
// intended rather than Math.max(r, minimum).
public static long remainingTime(long start, long due, long minimum) {
if (due < 0) return -1;
long r = due + start - System.currentTimeMillis();
if (r <= 0) return minimum; else return r;
}
public static void main(String[] args) {
//System.out.println("kelondroDate is (" + new kelondroDate().toString() + ")");
System.out.println("offset is " + (UTCDiff()/1000/60/60) + " hours, javaDate is " + new Date() + ", correctedDate is " + new Date(correctedUTCTime()));

Loading…
Cancel
Save