*) cleaning up the code a little bit

*) minor changes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7396 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
low012 14 years ago
parent 2a6499364d
commit 3d95981f7d

@ -5,7 +5,9 @@
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//last major change: 12.07.2004
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
@ -31,7 +33,6 @@ import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@ -85,7 +86,6 @@ public class ViewFile {
return prop;
}
final int display = post.getInt("display", 1);
// get segment
@ -209,7 +209,7 @@ public class ViewFile {
} else if (viewMode.equals("iframeCache")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CACHE);
String ext = url.getFileExtension();
final String ext = url.getFileExtension();
if ("jpg.jpeg.png.gif".indexOf(ext) >= 0) {
prop.put("viewMode_png", 1);
prop.put("viewMode_png_url", url.toNormalform(false, true));
@ -259,7 +259,7 @@ public class ViewFile {
if (sentences != null) {
// Search word highlighting
for (StringBuilder s: sentences) {
for (final StringBuilder s: sentences) {
sentence = s.toString();
if (sentence.trim().length() > 0) {
prop.put("viewMode_sentences_" + i + "_nr", i + 1);
@ -282,9 +282,9 @@ public class ViewFile {
if (sentences != null) {
// Search word highlighting
for (StringBuilder s: sentences) {
for (final StringBuilder s: sentences) {
sentence = s.toString();
Enumeration<String> tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib);
final Enumeration<String> tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib);
while (tokens.hasMoreElements()) {
token = tokens.nextElement();
if (token.length() > 0) {
@ -307,7 +307,7 @@ public class ViewFile {
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
dark = (i % 2 == 0);
final HashMap<MultiProtocolURI, ImageEntry> ts = document.getImages();
final Map<MultiProtocolURI, ImageEntry> ts = document.getImages();
final Iterator<ImageEntry> tsi = ts.values().iterator();
ImageEntry entry;
while (tsi.hasNext()) {
@ -353,7 +353,7 @@ public class ViewFile {
words = URLDecoder.decode(words, "UTF-8");
if (words.indexOf(' ') >= 0) return words.split(" ");
if (words.indexOf(',') >= 0) return words.split(",");
if (words.indexOf('+') >= 0) return words.split("+");
if (words.indexOf('+') >= 0) return words.split("\\+");
w = new String[1];
w[0] = words;
} catch (final UnsupportedEncodingException e) {}
@ -362,24 +362,23 @@ public class ViewFile {
private static final String markup(final String[] wordArray, String message) {
message = CharacterCoding.unicode2html(message, true);
if (wordArray != null)
for (int j = 0; j < wordArray.length; j++) {
final String currentWord = wordArray[j].trim();
if (wordArray != null) {
int j = 0;
for (String currentWord : wordArray) {
currentWord = currentWord.trim();
// TODO: replace upper-/lowercase words as well
message = message.replaceAll(currentWord,
"<span class=\"" + HIGHLIGHT_CSS + ((j % MAX_HIGHLIGHTS) + 1) + "\">" +
"<span class=\"" + HIGHLIGHT_CSS + ((j++ % MAX_HIGHLIGHTS) + 1) + "\">" +
currentWord +
"</span>");
}
}
return message;
}
private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map<MultiProtocolURI, String> media, final String name, boolean dark) {
final Iterator<Map.Entry<MultiProtocolURI, String>> mi = media.entrySet().iterator();
Map.Entry<MultiProtocolURI, String> entry;
int i = 0;
while (mi.hasNext()) {
entry = mi.next();
for (Map.Entry<MultiProtocolURI, String> entry : media.entrySet()) {
prop.put("viewMode_links_" + c + "_nr", c);
prop.put("viewMode_links_" + c + "_dark", ((dark) ? 1 : 0));
prop.putHTML("viewMode_links_" + c + "_type", name);

@ -1,4 +1,4 @@
// plasmaCrawlResultImages.java
// ResultImages.java
// (C) 2008 by by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net
// first published 13.04.2008 on http://yacy.net
//
@ -26,9 +26,10 @@
package de.anomic.crawler;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document;
@ -49,13 +50,13 @@ public class ResultImages {
// we also check all links for a double-check so we don't get the same image more than once in any queue
// image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence:
// the same images may be linked from different pages
private static final ConcurrentHashMap<MultiProtocolURI, Long> doubleCheck = new ConcurrentHashMap<MultiProtocolURI, Long>(); // (url, time) when the url appeared first
private static final ConcurrentMap<MultiProtocolURI, Long> doubleCheck = new ConcurrentHashMap<MultiProtocolURI, Long>(); // (url, time) when the url appeared first
public static void registerImages(final DigestURI source, final Document document, final boolean privateEntry) {
if (document == null) return;
if (source == null) return;
final HashMap<MultiProtocolURI, ImageEntry> images = document.getImages();
final Map<MultiProtocolURI, ImageEntry> images = document.getImages();
for (final ImageEntry image: images.values()) {
// do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup
if (doubleCheck.containsKey(image.url())) continue;
@ -78,7 +79,7 @@ public class ResultImages {
} else {
ratio = (float) image.height() / (float) image.width();
}
if (ratio < 1.0f || ratio > 2.0f) good = false;
good = !(ratio < 1.0f || ratio > 2.0f);
}
if (good) {
if (privateEntry) {

@ -1,11 +1,13 @@
//plasmaParserDocument.java
//Document.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//last major change: 24.04.2005
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
@ -32,7 +34,9 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@ -69,7 +73,7 @@ public class Document {
private Object text; // the clear text, all that is visible
private final Map<MultiProtocolURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
private final Map<MultiProtocolURI, String> rss; // all embedded rss feeds
private final HashMap<MultiProtocolURI, ImageEntry> images; // all visible pictures in document
private final Map<MultiProtocolURI, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
@ -87,7 +91,7 @@ public class Document {
final Object text,
final Map<MultiProtocolURI, String> anchors,
final Map<MultiProtocolURI, String> rss,
final HashMap<MultiProtocolURI, ImageEntry> images,
final Map<MultiProtocolURI, ImageEntry> images,
boolean indexingDenied) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
@ -294,7 +298,7 @@ dc_rights
if (this.text == null) return null;
final Condenser.sentencesFromInputStreamEnum e = Condenser.sentencesFromInputStream(getText());
e.pre(pre);
ArrayList<StringBuilder> sentences = new ArrayList<StringBuilder>();
List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (e.hasNext()) {
sentences.add(e.next());
}
@ -336,7 +340,7 @@ dc_rights
return this.videolinks;
}
public HashMap<MultiProtocolURI, ImageEntry> getImages() {
public Map<MultiProtocolURI, ImageEntry> getImages() {
// returns all links embedded as pictures (visible in document)
// this returns a htmlFilterImageEntry collection
if (!resorted) resortLinks();
@ -368,7 +372,7 @@ dc_rights
audiolinks = new HashMap<MultiProtocolURI, String>();
applinks = new HashMap<MultiProtocolURI, String>();
emaillinks = new HashMap<String, String>();
final HashMap<MultiProtocolURI, ImageEntry> collectedImages = new HashMap<MultiProtocolURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
final Map<MultiProtocolURI, ImageEntry> collectedImages = new HashMap<MultiProtocolURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
Map.Entry<MultiProtocolURI, String> entry;
while (i.hasNext()) {
entry = i.next();
@ -425,7 +429,7 @@ dc_rights
public static Map<MultiProtocolURI, String> allSubpaths(final Collection<?> links) {
// links is either a Set of Strings (urls) or a Set of
// htmlFilterImageEntries
final HashSet<String> h = new HashSet<String>();
final Set<String> h = new HashSet<String>();
Iterator<?> i = links.iterator();
Object o;
MultiProtocolURI url;
@ -457,7 +461,7 @@ dc_rights
} catch (final MalformedURLException e) { }
// now convert the strings to yacyURLs
i = h.iterator();
final HashMap<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
while (i.hasNext()) {
u = (String) i.next();
try {
@ -473,7 +477,7 @@ dc_rights
// links is either a Set of Strings (with urls) or
// htmlFilterImageEntries
// we find all links that are part of a reference inside a url
final HashMap<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, String> v = new HashMap<MultiProtocolURI, String>();
final Iterator<?> i = links.iterator();
Object o;
MultiProtocolURI url;
@ -567,7 +571,7 @@ dc_rights
return this.indexingDenied;
}
public void writeXML(OutputStreamWriter os, Date date) throws IOException {
public void writeXML(final Writer os, final Date date) throws IOException {
os.write("<record>\n");
String title = this.dc_title();
if (title != null && title.length() > 0) os.write("<dc:title><![CDATA[" + title + "]]></dc:title>\n");
@ -593,11 +597,11 @@ dc_rights
os.write("</record>\n");
}
@Override
public String toString() {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
OutputStreamWriter osw;
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
try {
osw = new OutputStreamWriter(baos, "UTF-8");
final Writer osw = new OutputStreamWriter(baos, "UTF-8");
writeXML(osw, new Date());
osw.close();
return new String(baos.toByteArray(), "UTF-8");
@ -631,7 +635,9 @@ dc_rights
* @param docs
* @return
*/
public static Document mergeDocuments(final MultiProtocolURI location, final String globalMime, Document[] docs) {
public static Document mergeDocuments(final MultiProtocolURI location,
final String globalMime, final Document[] docs)
{
if (docs == null || docs.length == 0) return null;
if (docs.length == 1) return docs[0];
@ -646,7 +652,7 @@ dc_rights
final Map<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, String> rss = new HashMap<MultiProtocolURI, String>();
final HashMap<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
final Map<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
for (Document doc: docs) {
@ -706,15 +712,17 @@ dc_rights
false);
}
/**
 * Collects the hyperlinks of all given documents into one map.
 * Each document's getHyperlinks() result is added with putAll, so if two
 * documents contain the same URL, the entry of the later document wins.
 *
 * @param documents the documents to read the hyperlinks from; must not be
 *                  null, because the array and each element are dereferenced
 * @return a new HashMap holding the union of all hyperlink maps
 */
public static Map<MultiProtocolURI, String> getHyperlinks(Document[] documents) {
Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (Document d: documents) result.putAll(d.getHyperlinks());
public static Map<MultiProtocolURI, String> getHyperlinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (final Document d: documents) {
// merge this document's links into the common result
result.putAll(d.getHyperlinks());
}
return result;
}
public static Map<MultiProtocolURI, String> getImagelinks(Document[] documents) {
Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (Document d: documents) {
public static Map<MultiProtocolURI, String> getImagelinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (final Document d: documents) {
for (ImageEntry imageReference : d.getImages().values()) {
result.put(imageReference.url(), imageReference.alt());
}

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.04.2009 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.04.2009 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 25.05.2009 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 11.06.2009 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 26.05.2009 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//

@ -2,9 +2,9 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 26.05.2009 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//

@ -4,9 +4,9 @@
// first published on http://www.yacy.net
// Braunschweig, Germany, 2008
//
// $LastChangedDate: 2008-05-23 23:00:00 +0200 (Fr, 23 Mai 2008) $
// $LastChangedRevision: 4824 $
// $LastChangedBy: low012 $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by

@ -4,9 +4,9 @@
// first published on http://www.yacy.net
// Braunschweig, Germany, 2008
//
// $LastChangedDate: 2008-05-18 23:00:00 +0200 (Di, 18 Mai 2008) $
// $LastChangedRevision: 4824 $
// $LastChangedBy: low012 $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by

@ -4,9 +4,9 @@
// first published on http://www.yacy.net
// Braunschweig, Germany, 2008
//
// $LastChangedDate: 2008-05-18 23:00:00 +0200 (Di, 18 Mai 2008) $
// $LastChangedRevision: 4824 $
// $LastChangedBy: low012 $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by

@ -4,9 +4,9 @@
// first published on http://www.yacy.net
// Braunschweig, Germany, 2008
//
// $LastChangedDate: 2008-05-23 23:00:00 +0200 (Fr, 23 Mai 2008) $
// $LastChangedRevision: 4824 $
// $LastChangedBy: low012 $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by

@ -29,8 +29,8 @@
package net.yacy.document.parser.html;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;
public abstract class AbstractScraper implements Scraper {
@ -38,15 +38,15 @@ public abstract class AbstractScraper implements Scraper {
public static final char rb = '>';
public static final char sl = '/';
private HashSet<String> tags0;
private HashSet<String> tags1;
private Set<String> tags0;
private Set<String> tags1;
/**
 * Creates a scraper and stores the two given tag sets in its fields.
 * The tag sets must contain tags in lowercase!
 * @param tags0 first tag set; kept by reference, not copied
 *              (presumably the tags that are scraped without enclosed text - TODO confirm)
 * @param tags1 second tag set; kept by reference, not copied
 *              (presumably the tags that are scraped together with their enclosed text,
 *              cf. scrapeTag1 - TODO confirm)
 */
public AbstractScraper(final HashSet<String> tags0, final HashSet<String> tags1) {
public AbstractScraper(final Set<String> tags0, final Set<String> tags1) {
this.tags0 = tags0;
this.tags1 = tags1;
}
@ -68,11 +68,9 @@ public abstract class AbstractScraper implements Scraper {
public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text);
protected static String stripAllTags(final char[] s) {
StringBuilder r = new StringBuilder(s.length);
final StringBuilder r = new StringBuilder(s.length);
int bc = 0;
char c;
for (int p = 0; p < s.length; p++) {
c = s[p];
for (final char c : s) {
if (c == lb) {
bc++;
r.append(' ');

@ -4,8 +4,6 @@
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// Contains contributions by Marc Nause [MN]
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
@ -41,6 +39,7 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import javax.swing.event.EventListenerList;
@ -55,8 +54,8 @@ import net.yacy.kelondro.util.ISO639;
public class ContentScraper extends AbstractScraper implements Scraper {
// statics: for initialization of the HTMLFilterAbstractScraper
private static final HashSet<String> linkTags0 = new HashSet<String>(9,0.99f);
private static final HashSet<String> linkTags1 = new HashSet<String>(7,0.99f);
private static final Set<String> linkTags0 = new HashSet<String>(9,0.99f);
private static final Set<String> linkTags1 = new HashSet<String>(7,0.99f);
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
static {
@ -79,10 +78,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
// class variables: collectors for links
private HashMap<MultiProtocolURI, String> rss;
private HashMap<MultiProtocolURI, String> anchors;
private HashMap<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final HashMap<String, String> metas;
private Map<MultiProtocolURI, String> rss;
private Map<MultiProtocolURI, String> anchors;
private Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final Map<String, String> metas;
private String title;
//private String headline;
private List<String>[] headlines;
@ -153,8 +152,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (b.length() != 0) content.append(b).append(32);
}
// Returns the index of the first occurrence of m in s at or after position start.
// If m does not occur there, Integer.MAX_VALUE is returned instead of -1.
private static final int find(final String s, final String m, int start) {
int p = s.indexOf(m, start);
private static final int find(final String s, final String m, final int start) {
final int p = s.indexOf(m, start);
// indexOf signals "not found" with a negative value; map that to the largest int
return (p < 0) ? Integer.MAX_VALUE : p;
}
@ -185,14 +184,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// addImage(images, ie);
}
} catch (final NumberFormatException e) {}
}
if (tagname.equalsIgnoreCase("base")) try {
root = new MultiProtocolURI(tagopts.getProperty("href", ""));
} catch (final MalformedURLException e) {}
if (tagname.equalsIgnoreCase("frame")) {
} else if(tagname.equalsIgnoreCase("base")) {
try {
root = new MultiProtocolURI(tagopts.getProperty("href", ""));
} catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
}
if (tagname.equalsIgnoreCase("meta")) {
} else if (tagname.equalsIgnoreCase("meta")) {
String name = tagopts.getProperty("name", "");
if (name.length() > 0) {
metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content","")));
@ -202,14 +200,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content","")));
}
}
}
if (tagname.equalsIgnoreCase("area")) {
} else if (tagname.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tagopts.getProperty("title",""));
//String alt = tagopts.getProperty("alt","");
final String href = tagopts.getProperty("href", "");
if (href.length() > 0) anchors.put(absolutePath(href), areatitle);
}
if (tagname.equalsIgnoreCase("link")) {
} else if (tagname.equalsIgnoreCase("link")) {
final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", ""));
if (newLink != null) {
@ -227,18 +223,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
anchors.put(newLink, linktitle);
}
}
}
//start contrib [MN]
if (tagname.equalsIgnoreCase("embed")) {
} else if(tagname.equalsIgnoreCase("embed")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
}
if (tagname.equalsIgnoreCase("param")) {
} else if(tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", "");
if (name.equalsIgnoreCase("movie")) {
anchors.put(absolutePath(tagopts.getProperty("value", "")),name);
}
}
//end contrib [MN]
// fire event
fireScrapeTag0(tagname, tagopts);
@ -262,24 +254,20 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
}
String h;
final String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[0].add(h);
}
if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
} else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[1].add(h);
}
if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
} else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[2].add(h);
}
if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
} else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[3].add(h);
}
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
} else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
title = recursiveParse(text);
}
@ -287,7 +275,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
fireScrapeTag1(tagname, tagopts, text);
}
private String recursiveParse(char[] inlineHtml) {
private String recursiveParse(final char[] inlineHtml) {
if (inlineHtml.length < 14) return cleanLine(super.stripAll(inlineHtml));
// start a new scraper to parse links inside this text
@ -307,11 +295,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return cleanLine(super.stripAll(scraper.content.getChars()));
}
private final static String cleanLine(String s) {
StringBuilder sb = new StringBuilder(s.length());
char c, l = ' ';
for (int i = 0; i < s.length(); i++) {
c = s.charAt(i);
private final static String cleanLine(final String s) {
final StringBuilder sb = new StringBuilder(s.length());
char l = ' ';
for (char c : s.toCharArray()) {
if (c < ' ') c = ' ';
if (c == ' ') {
if (l != ' ') sb.append(c);
@ -358,9 +345,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
/**
 * Returns the collected headlines of one headline level as an array.
 *
 * @param i the headline level, 1 to 4 (checked by an assertion);
 *          level i is stored in headlines[i - 1]
 * @return a new array that contains exactly the headlines of that level,
 *         in list order; empty if none were collected
 */
public String[] getHeadlines(final int i) {
    assert ((i >= 1) && (i <= 4));
    final List<String> level = headlines[i - 1];
    // size the array by the selected list, not by headlines.length (the number
    // of levels): toArray keeps a larger array and leaves unused slots null
    return level.toArray(new String[level.size()]);
}
public byte[] getText() {
@ -389,7 +374,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* get all images
* @return a map of <urlhash, ImageEntry>
*/
public HashMap<MultiProtocolURI, ImageEntry> getImages() {
public Map<MultiProtocolURI, ImageEntry> getImages() {
// this returns the url/ImageEntry relation; the internal map itself is handed out, not a copy
return images;
}
@ -448,13 +433,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return s;
}
public HashSet<String> getContentLanguages() {
public Set<String> getContentLanguages() {
// i.e. <meta name="DC.language" content="en" scheme="DCTERMS.RFC3066">
// or <meta http-equiv="content-language" content="en">
String s = metas.get("content-language");
if (s == null) s = metas.get("dc.language");
if (s == null) return null;
HashSet<String> hs = new HashSet<String>();
Set<String> hs = new HashSet<String>();
String[] cl = s.split(" |,");
int p;
for (int i = 0; i < cl.length; i++) {
@ -579,7 +564,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return scraper;
}
public static void addAllImages(final HashMap<MultiProtocolURI, ImageEntry> a, final HashMap<MultiProtocolURI, ImageEntry> b) {
public static void addAllImages(final Map<MultiProtocolURI, ImageEntry> a, final Map<MultiProtocolURI, ImageEntry> b) {
final Iterator<Map.Entry<MultiProtocolURI, ImageEntry>> i = b.entrySet().iterator();
Map.Entry<MultiProtocolURI, ImageEntry> ie;
while (i.hasNext()) {
@ -588,7 +573,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
public static void addImage(final HashMap<MultiProtocolURI, ImageEntry> a, final ImageEntry ie) {
public static void addImage(final Map<MultiProtocolURI, ImageEntry> a, final ImageEntry ie) {
if (a.containsKey(ie.url())) {
// in case of a collision, take that image that has the better image size tags
if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url(), ie);

@ -2,9 +2,9 @@
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.07.2007 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//

@ -4,9 +4,9 @@
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-10-11 02:12:19 +0200 (So, 11 Okt 2009) $
// $LastChangedRevision: 6398 $
// $LastChangedBy: orbiter $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//

@ -2,9 +2,9 @@
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.07.2007 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//

Loading…
Cancel
Save