*) cleaning up the code a little bit

*) minor changes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7396 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
low012 14 years ago
parent 2a6499364d
commit 3d95981f7d

@ -5,7 +5,9 @@
//first published on http://www.anomic.de //first published on http://www.anomic.de
//Frankfurt, Germany, 2004 //Frankfurt, Germany, 2004
//last major change: 12.07.2004 // $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//This program is free software; you can redistribute it and/or modify //This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by //it under the terms of the GNU General Public License as published by
@ -31,7 +33,6 @@ import java.net.MalformedURLException;
import java.net.URLDecoder; import java.net.URLDecoder;
import java.util.Collection; import java.util.Collection;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
@ -85,7 +86,6 @@ public class ViewFile {
return prop; return prop;
} }
final int display = post.getInt("display", 1); final int display = post.getInt("display", 1);
// get segment // get segment
@ -209,7 +209,7 @@ public class ViewFile {
} else if (viewMode.equals("iframeCache")) { } else if (viewMode.equals("iframeCache")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CACHE); prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CACHE);
String ext = url.getFileExtension(); final String ext = url.getFileExtension();
if ("jpg.jpeg.png.gif".indexOf(ext) >= 0) { if ("jpg.jpeg.png.gif".indexOf(ext) >= 0) {
prop.put("viewMode_png", 1); prop.put("viewMode_png", 1);
prop.put("viewMode_png_url", url.toNormalform(false, true)); prop.put("viewMode_png_url", url.toNormalform(false, true));
@ -259,7 +259,7 @@ public class ViewFile {
if (sentences != null) { if (sentences != null) {
// Search word highlighting // Search word highlighting
for (StringBuilder s: sentences) { for (final StringBuilder s: sentences) {
sentence = s.toString(); sentence = s.toString();
if (sentence.trim().length() > 0) { if (sentence.trim().length() > 0) {
prop.put("viewMode_sentences_" + i + "_nr", i + 1); prop.put("viewMode_sentences_" + i + "_nr", i + 1);
@ -282,9 +282,9 @@ public class ViewFile {
if (sentences != null) { if (sentences != null) {
// Search word highlighting // Search word highlighting
for (StringBuilder s: sentences) { for (final StringBuilder s: sentences) {
sentence = s.toString(); sentence = s.toString();
Enumeration<String> tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib); final Enumeration<String> tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib);
while (tokens.hasMoreElements()) { while (tokens.hasMoreElements()) {
token = tokens.nextElement(); token = tokens.nextElement();
if (token.length() > 0) { if (token.length() > 0) {
@ -307,7 +307,7 @@ public class ViewFile {
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0)); i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
dark = (i % 2 == 0); dark = (i % 2 == 0);
final HashMap<MultiProtocolURI, ImageEntry> ts = document.getImages(); final Map<MultiProtocolURI, ImageEntry> ts = document.getImages();
final Iterator<ImageEntry> tsi = ts.values().iterator(); final Iterator<ImageEntry> tsi = ts.values().iterator();
ImageEntry entry; ImageEntry entry;
while (tsi.hasNext()) { while (tsi.hasNext()) {
@ -353,7 +353,7 @@ public class ViewFile {
words = URLDecoder.decode(words, "UTF-8"); words = URLDecoder.decode(words, "UTF-8");
if (words.indexOf(' ') >= 0) return words.split(" "); if (words.indexOf(' ') >= 0) return words.split(" ");
if (words.indexOf(',') >= 0) return words.split(","); if (words.indexOf(',') >= 0) return words.split(",");
if (words.indexOf('+') >= 0) return words.split("+"); if (words.indexOf('+') >= 0) return words.split("\\+");
w = new String[1]; w = new String[1];
w[0] = words; w[0] = words;
} catch (final UnsupportedEncodingException e) {} } catch (final UnsupportedEncodingException e) {}
@ -362,24 +362,23 @@ public class ViewFile {
private static final String markup(final String[] wordArray, String message) { private static final String markup(final String[] wordArray, String message) {
message = CharacterCoding.unicode2html(message, true); message = CharacterCoding.unicode2html(message, true);
if (wordArray != null) if (wordArray != null) {
for (int j = 0; j < wordArray.length; j++) { int j = 0;
final String currentWord = wordArray[j].trim(); for (String currentWord : wordArray) {
currentWord = currentWord.trim();
// TODO: replace upper-/lowercase words as well // TODO: replace upper-/lowercase words as well
message = message.replaceAll(currentWord, message = message.replaceAll(currentWord,
"<span class=\"" + HIGHLIGHT_CSS + ((j % MAX_HIGHLIGHTS) + 1) + "\">" + "<span class=\"" + HIGHLIGHT_CSS + ((j++ % MAX_HIGHLIGHTS) + 1) + "\">" +
currentWord + currentWord +
"</span>"); "</span>");
} }
}
return message; return message;
} }
private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map<MultiProtocolURI, String> media, final String name, boolean dark) { private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map<MultiProtocolURI, String> media, final String name, boolean dark) {
final Iterator<Map.Entry<MultiProtocolURI, String>> mi = media.entrySet().iterator();
Map.Entry<MultiProtocolURI, String> entry;
int i = 0; int i = 0;
while (mi.hasNext()) { for (Map.Entry<MultiProtocolURI, String> entry : media.entrySet()) {
entry = mi.next();
prop.put("viewMode_links_" + c + "_nr", c); prop.put("viewMode_links_" + c + "_nr", c);
prop.put("viewMode_links_" + c + "_dark", ((dark) ? 1 : 0)); prop.put("viewMode_links_" + c + "_dark", ((dark) ? 1 : 0));
prop.putHTML("viewMode_links_" + c + "_type", name); prop.putHTML("viewMode_links_" + c + "_type", name);

@ -1,4 +1,4 @@
// plasmaCrawlResultImages.java // ResultImages.java
// (C) 2008 by by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net // (C) 2008 by by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net
// first published 13.04.2008 on http://yacy.net // first published 13.04.2008 on http://yacy.net
// //
@ -26,9 +26,10 @@
package de.anomic.crawler; package de.anomic.crawler;
import java.util.HashMap; import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document; import net.yacy.document.Document;
@ -49,13 +50,13 @@ public class ResultImages {
// we also check all links for a double-check so we don't get the same image more than once in any queue // we also check all links for a double-check so we don't get the same image more than once in any queue
// image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence: // image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence:
// the same images may be linked from different pages // the same images may be linked from different pages
private static final ConcurrentHashMap<MultiProtocolURI, Long> doubleCheck = new ConcurrentHashMap<MultiProtocolURI, Long>(); // (url, time) when the url appeared first private static final ConcurrentMap<MultiProtocolURI, Long> doubleCheck = new ConcurrentHashMap<MultiProtocolURI, Long>(); // (url, time) when the url appeared first
public static void registerImages(final DigestURI source, final Document document, final boolean privateEntry) { public static void registerImages(final DigestURI source, final Document document, final boolean privateEntry) {
if (document == null) return; if (document == null) return;
if (source == null) return; if (source == null) return;
final HashMap<MultiProtocolURI, ImageEntry> images = document.getImages(); final Map<MultiProtocolURI, ImageEntry> images = document.getImages();
for (final ImageEntry image: images.values()) { for (final ImageEntry image: images.values()) {
// do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup // do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup
if (doubleCheck.containsKey(image.url())) continue; if (doubleCheck.containsKey(image.url())) continue;
@ -78,7 +79,7 @@ public class ResultImages {
} else { } else {
ratio = (float) image.height() / (float) image.width(); ratio = (float) image.height() / (float) image.width();
} }
if (ratio < 1.0f || ratio > 2.0f) good = false; good = !(ratio < 1.0f || ratio > 2.0f);
} }
if (good) { if (good) {
if (privateEntry) { if (privateEntry) {

@ -1,11 +1,13 @@
//plasmaParserDocument.java //Document.java
//------------------------ //------------------------
//part of YaCy //part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net //(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de //first published on http://www.anomic.de
//Frankfurt, Germany, 2005 //Frankfurt, Germany, 2005
// //
//last major change: 24.04.2005 // $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
// //
//This program is free software; you can redistribute it and/or modify //This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by //it under the terms of the GNU General Public License as published by
@ -32,7 +34,9 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStreamWriter; import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
@ -69,7 +73,7 @@ public class Document {
private Object text; // the clear text, all that is visible private Object text; // the clear text, all that is visible
private final Map<MultiProtocolURI, String> anchors; // all links embedded as clickeable entities (anchor tags) private final Map<MultiProtocolURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
private final Map<MultiProtocolURI, String> rss; // all embedded rss feeds private final Map<MultiProtocolURI, String> rss; // all embedded rss feeds
private final HashMap<MultiProtocolURI, ImageEntry> images; // all visible pictures in document private final Map<MultiProtocolURI, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings. // the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative // The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags. // text in image tags.
@ -87,7 +91,7 @@ public class Document {
final Object text, final Object text,
final Map<MultiProtocolURI, String> anchors, final Map<MultiProtocolURI, String> anchors,
final Map<MultiProtocolURI, String> rss, final Map<MultiProtocolURI, String> rss,
final HashMap<MultiProtocolURI, ImageEntry> images, final Map<MultiProtocolURI, ImageEntry> images,
boolean indexingDenied) { boolean indexingDenied) {
this.source = location; this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
@ -294,7 +298,7 @@ dc_rights
if (this.text == null) return null; if (this.text == null) return null;
final Condenser.sentencesFromInputStreamEnum e = Condenser.sentencesFromInputStream(getText()); final Condenser.sentencesFromInputStreamEnum e = Condenser.sentencesFromInputStream(getText());
e.pre(pre); e.pre(pre);
ArrayList<StringBuilder> sentences = new ArrayList<StringBuilder>(); List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (e.hasNext()) { while (e.hasNext()) {
sentences.add(e.next()); sentences.add(e.next());
} }
@ -336,7 +340,7 @@ dc_rights
return this.videolinks; return this.videolinks;
} }
public HashMap<MultiProtocolURI, ImageEntry> getImages() { public Map<MultiProtocolURI, ImageEntry> getImages() {
// returns all links enbedded as pictures (visible in document) // returns all links enbedded as pictures (visible in document)
// this resturns a htmlFilterImageEntry collection // this resturns a htmlFilterImageEntry collection
if (!resorted) resortLinks(); if (!resorted) resortLinks();
@ -368,7 +372,7 @@ dc_rights
audiolinks = new HashMap<MultiProtocolURI, String>(); audiolinks = new HashMap<MultiProtocolURI, String>();
applinks = new HashMap<MultiProtocolURI, String>(); applinks = new HashMap<MultiProtocolURI, String>();
emaillinks = new HashMap<String, String>(); emaillinks = new HashMap<String, String>();
final HashMap<MultiProtocolURI, ImageEntry> collectedImages = new HashMap<MultiProtocolURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks final Map<MultiProtocolURI, ImageEntry> collectedImages = new HashMap<MultiProtocolURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
Map.Entry<MultiProtocolURI, String> entry; Map.Entry<MultiProtocolURI, String> entry;
while (i.hasNext()) { while (i.hasNext()) {
entry = i.next(); entry = i.next();
@ -425,7 +429,7 @@ dc_rights
public static Map<MultiProtocolURI, String> allSubpaths(final Collection<?> links) { public static Map<MultiProtocolURI, String> allSubpaths(final Collection<?> links) {
// links is either a Set of Strings (urls) or a Set of // links is either a Set of Strings (urls) or a Set of
// htmlFilterImageEntries // htmlFilterImageEntries
final HashSet<String> h = new HashSet<String>(); final Set<String> h = new HashSet<String>();
Iterator<?> i = links.iterator(); Iterator<?> i = links.iterator();
Object o; Object o;
MultiProtocolURI url; MultiProtocolURI url;
@ -457,7 +461,7 @@ dc_rights
} catch (final MalformedURLException e) { } } catch (final MalformedURLException e) { }
// now convert the strings to yacyURLs // now convert the strings to yacyURLs
i = h.iterator(); i = h.iterator();
final HashMap<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>(); final Map<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
while (i.hasNext()) { while (i.hasNext()) {
u = (String) i.next(); u = (String) i.next();
try { try {
@ -473,7 +477,7 @@ dc_rights
// links is either a Set of Strings (with urls) or // links is either a Set of Strings (with urls) or
// htmlFilterImageEntries // htmlFilterImageEntries
// we find all links that are part of a reference inside a url // we find all links that are part of a reference inside a url
final HashMap<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>(); final Map<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
final Iterator<?> i = links.iterator(); final Iterator<?> i = links.iterator();
Object o; Object o;
MultiProtocolURI url; MultiProtocolURI url;
@ -567,7 +571,7 @@ dc_rights
return this.indexingDenied; return this.indexingDenied;
} }
public void writeXML(OutputStreamWriter os, Date date) throws IOException { public void writeXML(final Writer os, final Date date) throws IOException {
os.write("<record>\n"); os.write("<record>\n");
String title = this.dc_title(); String title = this.dc_title();
if (title != null && title.length() > 0) os.write("<dc:title><![CDATA[" + title + "]]></dc:title>\n"); if (title != null && title.length() > 0) os.write("<dc:title><![CDATA[" + title + "]]></dc:title>\n");
@ -593,11 +597,11 @@ dc_rights
os.write("</record>\n"); os.write("</record>\n");
} }
@Override
public String toString() { public String toString() {
ByteArrayOutputStream baos = new ByteArrayOutputStream(); final ByteArrayOutputStream baos = new ByteArrayOutputStream();
OutputStreamWriter osw;
try { try {
osw = new OutputStreamWriter(baos, "UTF-8"); final Writer osw = new OutputStreamWriter(baos, "UTF-8");
writeXML(osw, new Date()); writeXML(osw, new Date());
osw.close(); osw.close();
return new String(baos.toByteArray(), "UTF-8"); return new String(baos.toByteArray(), "UTF-8");
@ -631,7 +635,9 @@ dc_rights
* @param docs * @param docs
* @return * @return
*/ */
public static Document mergeDocuments(final MultiProtocolURI location, final String globalMime, Document[] docs) { public static Document mergeDocuments(final MultiProtocolURI location,
final String globalMime, final Document[] docs)
{
if (docs == null || docs.length == 0) return null; if (docs == null || docs.length == 0) return null;
if (docs.length == 1) return docs[0]; if (docs.length == 1) return docs[0];
@ -646,7 +652,7 @@ dc_rights
final Map<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>(); final Map<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, String> rss = new HashMap<MultiProtocolURI, String>(); final Map<MultiProtocolURI, String> rss = new HashMap<MultiProtocolURI, String>();
final HashMap<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>(); final Map<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
for (Document doc: docs) { for (Document doc: docs) {
@ -706,15 +712,17 @@ dc_rights
false); false);
} }
public static Map<MultiProtocolURI, String> getHyperlinks(Document[] documents) { public static Map<MultiProtocolURI, String> getHyperlinks(final Document[] documents) {
Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>(); final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (Document d: documents) result.putAll(d.getHyperlinks()); for (final Document d: documents) {
result.putAll(d.getHyperlinks());
}
return result; return result;
} }
public static Map<MultiProtocolURI, String> getImagelinks(Document[] documents) { public static Map<MultiProtocolURI, String> getImagelinks(final Document[] documents) {
Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>(); final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (Document d: documents) { for (final Document d: documents) {
for (ImageEntry imageReference : d.getImages().values()) { for (ImageEntry imageReference : d.getImages().values()) {
result.put(imageReference.url(), imageReference.alt()); result.put(imageReference.url(), imageReference.alt());
} }

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.04.2009 on http://yacy.net // first published 15.04.2009 on http://yacy.net
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedDate$
// $LastChangedRevision: 1986 $ // $LastChangedRevision$
// $LastChangedBy: orbiter $ // $LastChangedBy$
// //
// LICENSE // LICENSE
// //

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.04.2009 on http://yacy.net // first published 15.04.2009 on http://yacy.net
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedDate$
// $LastChangedRevision: 1986 $ // $LastChangedRevision$
// $LastChangedBy: orbiter $ // $LastChangedBy$
// //
// LICENSE // LICENSE
// //

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 25.05.2009 on http://yacy.net // first published 25.05.2009 on http://yacy.net
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedDate$
// $LastChangedRevision: 1986 $ // $LastChangedRevision$
// $LastChangedBy: orbiter $ // $LastChangedBy$
// //
// LICENSE // LICENSE
// //

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 11.06.2009 on http://yacy.net // first published 11.06.2009 on http://yacy.net
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedDate$
// $LastChangedRevision: 1986 $ // $LastChangedRevision$
// $LastChangedBy: orbiter $ // $LastChangedBy$
// //
// LICENSE // LICENSE
// //

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 26.05.2009 on http://yacy.net // first published 26.05.2009 on http://yacy.net
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedDate$
// $LastChangedRevision: 1986 $ // $LastChangedRevision$
// $LastChangedBy: orbiter $ // $LastChangedBy$
// //
// LICENSE // LICENSE
// //

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 26.05.2009 on http://yacy.net // first published 26.05.2009 on http://yacy.net
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedDate$
// $LastChangedRevision: 1986 $ // $LastChangedRevision$
// $LastChangedBy: orbiter $ // $LastChangedBy$
// //
// LICENSE // LICENSE
// //

@ -4,9 +4,9 @@
// first published on http://www.yacy.net // first published on http://www.yacy.net
// Braunschweig, Germany, 2008 // Braunschweig, Germany, 2008
// //
// $LastChangedDate: 2008-05-23 23:00:00 +0200 (Fr, 23 Mai 2008) $ // $LastChangedDate$
// $LastChangedRevision: 4824 $ // $LastChangedRevision$
// $LastChangedBy: low012 $ // $LastChangedBy$
// //
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by

@ -4,9 +4,9 @@
// first published on http://www.yacy.net // first published on http://www.yacy.net
// Braunschweig, Germany, 2008 // Braunschweig, Germany, 2008
// //
// $LastChangedDate: 2008-05-18 23:00:00 +0200 (Di, 18 Mai 2008) $ // $LastChangedDate$
// $LastChangedRevision: 4824 $ // $LastChangedRevision$
// $LastChangedBy: low012 $ // $LastChangedBy$
// //
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by

@ -4,9 +4,9 @@
// first published on http://www.yacy.net // first published on http://www.yacy.net
// Braunschweig, Germany, 2008 // Braunschweig, Germany, 2008
// //
// $LastChangedDate: 2008-05-18 23:00:00 +0200 (Di, 18 Mai 2008) $ // $LastChangedDate$
// $LastChangedRevision: 4824 $ // $LastChangedRevision$
// $LastChangedBy: low012 $ // $LastChangedBy$
// //
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by

@ -4,9 +4,9 @@
// first published on http://www.yacy.net // first published on http://www.yacy.net
// Braunschweig, Germany, 2008 // Braunschweig, Germany, 2008
// //
// $LastChangedDate: 2008-05-23 23:00:00 +0200 (Fr, 23 Mai 2008) $ // $LastChangedDate$
// $LastChangedRevision: 4824 $ // $LastChangedRevision$
// $LastChangedBy: low012 $ // $LastChangedBy$
// //
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by

@ -29,8 +29,8 @@
package net.yacy.document.parser.html; package net.yacy.document.parser.html;
import java.util.HashSet;
import java.util.Properties; import java.util.Properties;
import java.util.Set;
public abstract class AbstractScraper implements Scraper { public abstract class AbstractScraper implements Scraper {
@ -38,15 +38,15 @@ public abstract class AbstractScraper implements Scraper {
public static final char rb = '>'; public static final char rb = '>';
public static final char sl = '/'; public static final char sl = '/';
private HashSet<String> tags0; private Set<String> tags0;
private HashSet<String> tags1; private Set<String> tags1;
/** /**
* create a scraper. the tag sets must contain tags in lowercase! * create a scraper. the tag sets must contain tags in lowercase!
* @param tags0 * @param tags0
* @param tags1 * @param tags1
*/ */
public AbstractScraper(final HashSet<String> tags0, final HashSet<String> tags1) { public AbstractScraper(final Set<String> tags0, final Set<String> tags1) {
this.tags0 = tags0; this.tags0 = tags0;
this.tags1 = tags1; this.tags1 = tags1;
} }
@ -68,11 +68,9 @@ public abstract class AbstractScraper implements Scraper {
public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text); public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
protected static String stripAllTags(final char[] s) { protected static String stripAllTags(final char[] s) {
StringBuilder r = new StringBuilder(s.length); final StringBuilder r = new StringBuilder(s.length);
int bc = 0; int bc = 0;
char c; for (final char c : s) {
for (int p = 0; p < s.length; p++) {
c = s[p];
if (c == lb) { if (c == lb) {
bc++; bc++;
r.append(' '); r.append(' ');

@ -4,8 +4,6 @@
// first published on http://www.anomic.de // first published on http://www.anomic.de
// Frankfurt, Germany, 2004 // Frankfurt, Germany, 2004
// //
// Contains contributions by Marc Nause [MN]
//
// $LastChangedDate$ // $LastChangedDate$
// $LastChangedRevision$ // $LastChangedRevision$
// $LastChangedBy$ // $LastChangedBy$
@ -41,6 +39,7 @@ import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Properties; import java.util.Properties;
import java.util.Set;
import javax.swing.event.EventListenerList; import javax.swing.event.EventListenerList;
@ -55,8 +54,8 @@ import net.yacy.kelondro.util.ISO639;
public class ContentScraper extends AbstractScraper implements Scraper { public class ContentScraper extends AbstractScraper implements Scraper {
// statics: for initialization of the HTMLFilterAbstractScraper // statics: for initialization of the HTMLFilterAbstractScraper
private static final HashSet<String> linkTags0 = new HashSet<String>(9,0.99f); private static final Set<String> linkTags0 = new HashSet<String>(9,0.99f);
private static final HashSet<String> linkTags1 = new HashSet<String>(7,0.99f); private static final Set<String> linkTags1 = new HashSet<String>(7,0.99f);
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase // all these tags must be given in lowercase, because the tags from the files are compared in lowercase
static { static {
@ -79,10 +78,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
// class variables: collectors for links // class variables: collectors for links
private HashMap<MultiProtocolURI, String> rss; private Map<MultiProtocolURI, String> rss;
private HashMap<MultiProtocolURI, String> anchors; private Map<MultiProtocolURI, String> anchors;
private HashMap<MultiProtocolURI, ImageEntry> images; // urlhash/image relation private Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final HashMap<String, String> metas; private final Map<String, String> metas;
private String title; private String title;
//private String headline; //private String headline;
private List<String>[] headlines; private List<String>[] headlines;
@ -153,8 +152,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (b.length() != 0) content.append(b).append(32); if (b.length() != 0) content.append(b).append(32);
} }
private static final int find(final String s, final String m, int start) { private static final int find(final String s, final String m, final int start) {
int p = s.indexOf(m, start); final int p = s.indexOf(m, start);
return (p < 0) ? Integer.MAX_VALUE : p; return (p < 0) ? Integer.MAX_VALUE : p;
} }
@ -185,14 +184,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// addImage(images, ie); // addImage(images, ie);
} }
} catch (final NumberFormatException e) {} } catch (final NumberFormatException e) {}
} } else if(tagname.equalsIgnoreCase("base")) {
if (tagname.equalsIgnoreCase("base")) try { try {
root = new MultiProtocolURI(tagopts.getProperty("href", "")); root = new MultiProtocolURI(tagopts.getProperty("href", ""));
} catch (final MalformedURLException e) {} } catch (final MalformedURLException e) {}
if (tagname.equalsIgnoreCase("frame")) { } else if (tagname.equalsIgnoreCase("frame")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name","")); anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
} } else if (tagname.equalsIgnoreCase("meta")) {
if (tagname.equalsIgnoreCase("meta")) {
String name = tagopts.getProperty("name", ""); String name = tagopts.getProperty("name", "");
if (name.length() > 0) { if (name.length() > 0) {
metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content",""))); metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content","")));
@ -202,14 +200,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content",""))); metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content","")));
} }
} }
} } else if (tagname.equalsIgnoreCase("area")) {
if (tagname.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tagopts.getProperty("title","")); final String areatitle = cleanLine(tagopts.getProperty("title",""));
//String alt = tagopts.getProperty("alt",""); //String alt = tagopts.getProperty("alt","");
final String href = tagopts.getProperty("href", ""); final String href = tagopts.getProperty("href", "");
if (href.length() > 0) anchors.put(absolutePath(href), areatitle); if (href.length() > 0) anchors.put(absolutePath(href), areatitle);
} } else if (tagname.equalsIgnoreCase("link")) {
if (tagname.equalsIgnoreCase("link")) {
final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", "")); final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", ""));
if (newLink != null) { if (newLink != null) {
@ -227,18 +223,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
anchors.put(newLink, linktitle); anchors.put(newLink, linktitle);
} }
} }
} } else if(tagname.equalsIgnoreCase("embed")) {
//start contrib [MN]
if (tagname.equalsIgnoreCase("embed")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name","")); anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
} } else if(tagname.equalsIgnoreCase("param")) {
if (tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", ""); final String name = tagopts.getProperty("name", "");
if (name.equalsIgnoreCase("movie")) { if (name.equalsIgnoreCase("movie")) {
anchors.put(absolutePath(tagopts.getProperty("value", "")),name); anchors.put(absolutePath(tagopts.getProperty("value", "")),name);
} }
} }
//end contrib [MN]
// fire event // fire event
fireScrapeTag0(tagname, tagopts); fireScrapeTag0(tagname, tagopts);
@ -262,24 +254,20 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
} }
} }
String h; final String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) { if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
h = recursiveParse(text); h = recursiveParse(text);
if (h.length() > 0) headlines[0].add(h); if (h.length() > 0) headlines[0].add(h);
} } else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
h = recursiveParse(text); h = recursiveParse(text);
if (h.length() > 0) headlines[1].add(h); if (h.length() > 0) headlines[1].add(h);
} } else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
h = recursiveParse(text); h = recursiveParse(text);
if (h.length() > 0) headlines[2].add(h); if (h.length() > 0) headlines[2].add(h);
} } else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
h = recursiveParse(text); h = recursiveParse(text);
if (h.length() > 0) headlines[3].add(h); if (h.length() > 0) headlines[3].add(h);
} } else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
title = recursiveParse(text); title = recursiveParse(text);
} }
@ -287,7 +275,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
fireScrapeTag1(tagname, tagopts, text); fireScrapeTag1(tagname, tagopts, text);
} }
private String recursiveParse(char[] inlineHtml) { private String recursiveParse(final char[] inlineHtml) {
if (inlineHtml.length < 14) return cleanLine(super.stripAll(inlineHtml)); if (inlineHtml.length < 14) return cleanLine(super.stripAll(inlineHtml));
// start a new scraper to parse links inside this text // start a new scraper to parse links inside this text
@ -307,11 +295,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return cleanLine(super.stripAll(scraper.content.getChars())); return cleanLine(super.stripAll(scraper.content.getChars()));
} }
private final static String cleanLine(String s) { private final static String cleanLine(final String s) {
StringBuilder sb = new StringBuilder(s.length()); final StringBuilder sb = new StringBuilder(s.length());
char c, l = ' '; char l = ' ';
for (int i = 0; i < s.length(); i++) { for (char c : s.toCharArray()) {
c = s.charAt(i);
if (c < ' ') c = ' '; if (c < ' ') c = ' ';
if (c == ' ') { if (c == ' ') {
if (l != ' ') sb.append(c); if (l != ' ') sb.append(c);
@ -358,9 +345,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String[] getHeadlines(final int i) { public String[] getHeadlines(final int i) {
assert ((i >= 1) && (i <= 4)); assert ((i >= 1) && (i <= 4));
final String[] s = new String[headlines[i - 1].size()]; return headlines[i - 1].toArray(new String[headlines.length]);
for (int j = 0; j < headlines[i - 1].size(); j++) s[j] = headlines[i - 1].get(j);
return s;
} }
public byte[] getText() { public byte[] getText() {
@ -389,7 +374,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* get all images * get all images
* @return a map of <urlhash, ImageEntry> * @return a map of <urlhash, ImageEntry>
*/ */
public HashMap<MultiProtocolURI, ImageEntry> getImages() { public Map<MultiProtocolURI, ImageEntry> getImages() {
// this resturns a String(absolute url)/htmlFilterImageEntry - relation // this resturns a String(absolute url)/htmlFilterImageEntry - relation
return images; return images;
} }
@ -448,13 +433,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return s; return s;
} }
public HashSet<String> getContentLanguages() { public Set<String> getContentLanguages() {
// i.e. <meta name="DC.language" content="en" scheme="DCTERMS.RFC3066"> // i.e. <meta name="DC.language" content="en" scheme="DCTERMS.RFC3066">
// or <meta http-equiv="content-language" content="en"> // or <meta http-equiv="content-language" content="en">
String s = metas.get("content-language"); String s = metas.get("content-language");
if (s == null) s = metas.get("dc.language"); if (s == null) s = metas.get("dc.language");
if (s == null) return null; if (s == null) return null;
HashSet<String> hs = new HashSet<String>(); Set<String> hs = new HashSet<String>();
String[] cl = s.split(" |,"); String[] cl = s.split(" |,");
int p; int p;
for (int i = 0; i < cl.length; i++) { for (int i = 0; i < cl.length; i++) {
@ -579,7 +564,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return scraper; return scraper;
} }
public static void addAllImages(final HashMap<MultiProtocolURI, ImageEntry> a, final HashMap<MultiProtocolURI, ImageEntry> b) { public static void addAllImages(final Map<MultiProtocolURI, ImageEntry> a, final Map<MultiProtocolURI, ImageEntry> b) {
final Iterator<Map.Entry<MultiProtocolURI, ImageEntry>> i = b.entrySet().iterator(); final Iterator<Map.Entry<MultiProtocolURI, ImageEntry>> i = b.entrySet().iterator();
Map.Entry<MultiProtocolURI, ImageEntry> ie; Map.Entry<MultiProtocolURI, ImageEntry> ie;
while (i.hasNext()) { while (i.hasNext()) {
@ -588,7 +573,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
} }
public static void addImage(final HashMap<MultiProtocolURI, ImageEntry> a, final ImageEntry ie) { public static void addImage(final Map<MultiProtocolURI, ImageEntry> a, final ImageEntry ie) {
if (a.containsKey(ie.url())) { if (a.containsKey(ie.url())) {
// in case of a collision, take that image that has the better image size tags // in case of a collision, take that image that has the better image size tags
if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url(), ie); if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url(), ie);

@ -2,9 +2,9 @@
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.07.2007 on http://yacy.net // first published 15.07.2007 on http://yacy.net
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedDate$
// $LastChangedRevision: 1986 $ // $LastChangedRevision$
// $LastChangedBy: orbiter $ // $LastChangedBy$
// //
// LICENSE // LICENSE
// //

@ -4,9 +4,9 @@
// //
// This is a part of YaCy, a peer-to-peer based web search engine // This is a part of YaCy, a peer-to-peer based web search engine
// //
// $LastChangedDate: 2009-10-11 02:12:19 +0200 (So, 11 Okt 2009) $ // $LastChangedDate$
// $LastChangedRevision: 6398 $ // $LastChangedRevision$
// $LastChangedBy: orbiter $ // $LastChangedBy$
// //
// LICENSE // LICENSE
// //

@ -2,9 +2,9 @@
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.07.2007 on http://yacy.net // first published 15.07.2007 on http://yacy.net
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedDate$
// $LastChangedRevision: 1986 $ // $LastChangedRevision$
// $LastChangedBy: orbiter $ // $LastChangedBy$
// //
// LICENSE // LICENSE
// //

Loading…
Cancel
Save