From 3d95981f7d211b7935d9cc84bb29ffcb6ec51bf8 Mon Sep 17 00:00:00 2001 From: low012 Date: Mon, 27 Dec 2010 17:07:21 +0000 Subject: [PATCH] *) cleaning up the code a little bit *) minor changes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7396 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ViewFile.java | 33 ++++--- source/de/anomic/crawler/ResultImages.java | 11 +-- source/net/yacy/document/Document.java | 52 +++++++----- source/net/yacy/document/content/DCEntry.java | 6 +- .../document/content/SurrogateReader.java | 6 +- source/net/yacy/document/content/dao/Dao.java | 6 +- .../content/dao/DatabaseConnection.java | 6 +- .../yacy/document/content/dao/ImportDump.java | 6 +- .../yacy/document/content/dao/PhpBB3Dao.java | 6 +- .../yacy/document/language/Identificator.java | 6 +- .../language/LanguageFilenameFilter.java | 6 +- .../document/language/LanguageStatistics.java | 6 +- .../language/LanguageStatisticsHolder.java | 6 +- .../document/parser/html/AbstractScraper.java | 14 ++- .../document/parser/html/ContentScraper.java | 85 ++++++++----------- .../document/parser/images/bmpParser.java | 6 +- .../parser/images/genericImageParser.java | 6 +- .../document/parser/images/icoParser.java | 6 +- 18 files changed, 132 insertions(+), 141 deletions(-) diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index ab48a4ab2..a274c43bb 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -5,7 +5,9 @@ //first published on http://www.anomic.de //Frankfurt, Germany, 2004 -//last major change: 12.07.2004 +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ //This program is free software; you can redistribute it and/or modify //it under the terms of the GNU General Public License as published by @@ -31,7 +33,6 @@ import java.net.MalformedURLException; import java.net.URLDecoder; import java.util.Collection; import java.util.Enumeration; -import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -85,7 +86,6 @@ public class ViewFile { return prop; } - final int display = post.getInt("display", 1); // get segment @@ -209,7 +209,7 @@ public class ViewFile { } else if (viewMode.equals("iframeCache")) { prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CACHE); - String ext = url.getFileExtension(); + final String ext = url.getFileExtension(); if ("jpg.jpeg.png.gif".indexOf(ext) >= 0) { prop.put("viewMode_png", 1); prop.put("viewMode_png_url", url.toNormalform(false, true)); @@ -259,7 +259,7 @@ public class ViewFile { if (sentences != null) { // Search word highlighting - for (StringBuilder s: sentences) { + for (final StringBuilder s: sentences) { sentence = s.toString(); if (sentence.trim().length() > 0) { prop.put("viewMode_sentences_" + i + "_nr", i + 1); @@ -282,9 +282,9 @@ public class ViewFile { if (sentences != null) { // Search word highlighting - for (StringBuilder s: sentences) { + for (final StringBuilder s: sentences) { sentence = s.toString(); - Enumeration tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib); + final Enumeration tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib); while (tokens.hasMoreElements()) { token = tokens.nextElement(); if (token.length() > 0) { @@ -307,7 +307,7 @@ public class ViewFile { i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0)); dark = (i % 2 == 0); - final HashMap ts = document.getImages(); + final Map ts = document.getImages(); final Iterator tsi = ts.values().iterator(); ImageEntry entry; while (tsi.hasNext()) { @@ -353,7 +353,7 @@ public class ViewFile { words = URLDecoder.decode(words, "UTF-8"); if (words.indexOf(' ') >= 0) return words.split(" "); if (words.indexOf(',') >= 0) return words.split(","); - if (words.indexOf('+') >= 0) return words.split("+"); + if (words.indexOf('+') >= 0) return words.split("\\+"); w = new String[1]; w[0] = words; } catch (final UnsupportedEncodingException e) {} @@ -362,24 +362,23 @@ public class ViewFile { private static final String markup(final String[] wordArray, String message) { message = CharacterCoding.unicode2html(message, true); - if (wordArray != null) - for (int j = 0; j < wordArray.length; j++) { - final String currentWord = wordArray[j].trim(); + if (wordArray != null) { + int j = 0; + for (String currentWord : wordArray) { + currentWord = currentWord.trim(); // TODO: replace upper-/lowercase words as well message = message.replaceAll(currentWord, - "" + + "" + currentWord + ""); } + } return message; } private static int putMediaInfo(final serverObjects prop, final String[] wordArray, int c, final Map media, final String name, boolean dark) { - final Iterator> mi = media.entrySet().iterator(); - Map.Entry entry; int i = 0; - while (mi.hasNext()) { - entry = mi.next(); + for (Map.Entry entry : media.entrySet()) { prop.put("viewMode_links_" + c + "_nr", c); prop.put("viewMode_links_" + c + "_dark", ((dark) ? 1 : 0)); prop.putHTML("viewMode_links_" + c + "_type", name); diff --git a/source/de/anomic/crawler/ResultImages.java b/source/de/anomic/crawler/ResultImages.java index 352c961ea..04994366c 100755 --- a/source/de/anomic/crawler/ResultImages.java +++ b/source/de/anomic/crawler/ResultImages.java @@ -1,4 +1,4 @@ -// plasmaCrawlResultImages.java +// ResultImages.java // (C) 2008 by by Detlef Reichl; detlef!reichl()gmx!org and Michael Peter Christen; mc@yacy.net // first published 13.04.2008 on http://yacy.net // @@ -26,9 +26,10 @@ package de.anomic.crawler; -import java.util.HashMap; +import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ConcurrentMap; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.Document; @@ -49,13 +50,13 @@ public class ResultImages { // we also check all links for a double-check so we don't get the same image more than once in any queue // image links may appear double here even if the pages where the image links are embedded already are checked for double-occurrence: // the same images may be linked from different pages - private static final ConcurrentHashMap doubleCheck = new ConcurrentHashMap(); // (url, time) when the url appeared first + private static final ConcurrentMap doubleCheck = new ConcurrentHashMap(); // (url, time) when the url appeared first public static void registerImages(final DigestURI source, final Document document, final boolean privateEntry) { if (document == null) return; if (source == null) return; - final HashMap images = document.getImages(); + final Map images = document.getImages(); for (final ImageEntry image: images.values()) { // do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup if (doubleCheck.containsKey(image.url())) continue; @@ -78,7 +79,7 @@ public class ResultImages { } else { ratio = (float) image.height() / (float) image.width(); } - if (ratio < 1.0f || ratio > 2.0f) good = false; + good = !(ratio < 1.0f || ratio > 2.0f); } if (good) { if (privateEntry) { diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index d6efdd64f..b6399216a 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -1,11 +1,13 @@ -//plasmaParserDocument.java +//Document.java //------------------------ //part of YaCy //(C) by Michael Peter Christen; mc@yacy.net //first published on http://www.anomic.de //Frankfurt, Germany, 2005 // -//last major change: 24.04.2005 +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // //This program is free software; you can redistribute it and/or modify //it under the terms of the GNU General Public License as published by @@ -32,7 +34,9 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; +import java.io.Writer; import java.net.MalformedURLException; +import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -69,7 +73,7 @@ public class Document { private Object text; // the clear text, all that is visible private final Map anchors; // all links embedded as clickeable entities (anchor tags) private final Map rss; // all embedded rss feeds - private final HashMap images; // all visible pictures in document + private final Map images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. @@ -87,7 +91,7 @@ public class Document { final Object text, final Map anchors, final Map rss, - final HashMap images, + final Map images, boolean indexingDenied) { this.source = location; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; @@ -294,7 +298,7 @@ dc_rights if (this.text == null) return null; final Condenser.sentencesFromInputStreamEnum e = Condenser.sentencesFromInputStream(getText()); e.pre(pre); - ArrayList sentences = new ArrayList(); + List sentences = new ArrayList(); while (e.hasNext()) { sentences.add(e.next()); } @@ -336,7 +340,7 @@ dc_rights return this.videolinks; } - public HashMap getImages() { + public Map getImages() { // returns all links enbedded as pictures (visible in document) // this resturns a htmlFilterImageEntry collection if (!resorted) resortLinks(); @@ -368,7 +372,7 @@ dc_rights audiolinks = new HashMap(); applinks = new HashMap(); emaillinks = new HashMap(); - final HashMap collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks + final Map collectedImages = new HashMap(); // this is a set that is collected now and joined later to the imagelinks Map.Entry entry; while (i.hasNext()) { entry = i.next(); @@ -425,7 +429,7 @@ dc_rights public static Map allSubpaths(final Collection links) { // links is either a Set of Strings (urls) or a Set of // htmlFilterImageEntries - final HashSet h = new HashSet(); + final Set h = new HashSet(); Iterator i = links.iterator(); Object o; MultiProtocolURI url; @@ -457,7 +461,7 @@ dc_rights } catch (final MalformedURLException e) { } // now convert the strings to yacyURLs i = h.iterator(); - final HashMap v = new HashMap(); + final Map v = new HashMap(); while (i.hasNext()) { u = (String) i.next(); try { @@ -473,7 +477,7 @@ dc_rights // links is either a Set of Strings (with urls) or // htmlFilterImageEntries // we find all links that are part of a reference inside a url - final HashMap v = new HashMap(); + final Map v = new HashMap(); final Iterator i = links.iterator(); Object o; MultiProtocolURI url; @@ -567,7 +571,7 @@ dc_rights return this.indexingDenied; } - public void writeXML(OutputStreamWriter os, Date date) throws IOException { + public void writeXML(final Writer os, final Date date) throws IOException { os.write("\n"); String title = this.dc_title(); if (title != null && title.length() > 0) os.write("\n"); @@ -593,11 +597,11 @@ dc_rights os.write("\n"); } + @Override public String toString() { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - OutputStreamWriter osw; + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); try { - osw = new OutputStreamWriter(baos, "UTF-8"); + final Writer osw = new OutputStreamWriter(baos, "UTF-8"); writeXML(osw, new Date()); osw.close(); return new String(baos.toByteArray(), "UTF-8"); @@ -631,7 +635,9 @@ dc_rights * @param docs * @return */ - public static Document mergeDocuments(final MultiProtocolURI location, final String globalMime, Document[] docs) { + public static Document mergeDocuments(final MultiProtocolURI location, + final String globalMime, final Document[] docs) + { if (docs == null || docs.length == 0) return null; if (docs.length == 1) return docs[0]; @@ -646,7 +652,7 @@ dc_rights final Map anchors = new HashMap(); final Map rss = new HashMap(); - final HashMap images = new HashMap(); + final Map images = new HashMap(); for (Document doc: docs) { @@ -706,15 +712,17 @@ dc_rights false); } - public static Map getHyperlinks(Document[] documents) { - Map result = new HashMap(); - for (Document d: documents) result.putAll(d.getHyperlinks()); + public static Map getHyperlinks(final Document[] documents) { + final Map result = new HashMap(); + for (final Document d: documents) { + result.putAll(d.getHyperlinks()); + } return result; } - public static Map getImagelinks(Document[] documents) { - Map result = new HashMap(); - for (Document d: documents) { + public static Map getImagelinks(final Document[] documents) { + final Map result = new HashMap(); + for (final Document d: documents) { for (ImageEntry imageReference : d.getImages().values()) { result.put(imageReference.url(), imageReference.alt()); } diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index f18d51b0b..039a111de 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -2,9 +2,9 @@ // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 15.04.2009 on http://yacy.net // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/net/yacy/document/content/SurrogateReader.java b/source/net/yacy/document/content/SurrogateReader.java index 6578eeea9..ab8fbd7be 100644 --- a/source/net/yacy/document/content/SurrogateReader.java +++ b/source/net/yacy/document/content/SurrogateReader.java @@ -2,9 +2,9 @@ // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 15.04.2009 on http://yacy.net // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/net/yacy/document/content/dao/Dao.java b/source/net/yacy/document/content/dao/Dao.java index 1455f14fc..62e43a1eb 100644 --- a/source/net/yacy/document/content/dao/Dao.java +++ b/source/net/yacy/document/content/dao/Dao.java @@ -2,9 +2,9 @@ // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 25.05.2009 on http://yacy.net // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/net/yacy/document/content/dao/DatabaseConnection.java b/source/net/yacy/document/content/dao/DatabaseConnection.java index 6187d9c43..6cfadca26 100644 --- a/source/net/yacy/document/content/dao/DatabaseConnection.java +++ b/source/net/yacy/document/content/dao/DatabaseConnection.java @@ -2,9 +2,9 @@ // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 11.06.2009 on http://yacy.net // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/net/yacy/document/content/dao/ImportDump.java b/source/net/yacy/document/content/dao/ImportDump.java index 59dc8c5e3..71a4a402a 100644 --- a/source/net/yacy/document/content/dao/ImportDump.java +++ b/source/net/yacy/document/content/dao/ImportDump.java @@ -2,9 +2,9 @@ // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 26.05.2009 on http://yacy.net // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/net/yacy/document/content/dao/PhpBB3Dao.java b/source/net/yacy/document/content/dao/PhpBB3Dao.java index e728ce3da..191e9a52e 100644 --- a/source/net/yacy/document/content/dao/PhpBB3Dao.java +++ b/source/net/yacy/document/content/dao/PhpBB3Dao.java @@ -2,9 +2,9 @@ // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 26.05.2009 on http://yacy.net // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/net/yacy/document/language/Identificator.java b/source/net/yacy/document/language/Identificator.java index 7ee072fdc..3ac9cfd63 100644 --- a/source/net/yacy/document/language/Identificator.java +++ b/source/net/yacy/document/language/Identificator.java @@ -4,9 +4,9 @@ // first published on http://www.yacy.net // Braunschweig, Germany, 2008 // -// $LastChangedDate: 2008-05-23 23:00:00 +0200 (Fr, 23 Mai 2008) $ -// $LastChangedRevision: 4824 $ -// $LastChangedBy: low012 $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by diff --git a/source/net/yacy/document/language/LanguageFilenameFilter.java b/source/net/yacy/document/language/LanguageFilenameFilter.java index 6ded9040b..c6b67da93 100644 --- a/source/net/yacy/document/language/LanguageFilenameFilter.java +++ b/source/net/yacy/document/language/LanguageFilenameFilter.java @@ -4,9 +4,9 @@ // first published on http://www.yacy.net // Braunschweig, Germany, 2008 // -// $LastChangedDate: 2008-05-18 23:00:00 +0200 (Di, 18 Mai 2008) $ -// $LastChangedRevision: 4824 $ -// $LastChangedBy: low012 $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by diff --git a/source/net/yacy/document/language/LanguageStatistics.java b/source/net/yacy/document/language/LanguageStatistics.java index 76ba979d4..70115d87e 100644 --- a/source/net/yacy/document/language/LanguageStatistics.java +++ b/source/net/yacy/document/language/LanguageStatistics.java @@ -4,9 +4,9 @@ // first published on http://www.yacy.net // Braunschweig, Germany, 2008 // -// $LastChangedDate: 2008-05-18 23:00:00 +0200 (Di, 18 Mai 2008) $ -// $LastChangedRevision: 4824 $ -// $LastChangedBy: low012 $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by diff --git a/source/net/yacy/document/language/LanguageStatisticsHolder.java b/source/net/yacy/document/language/LanguageStatisticsHolder.java index 0581e4896..1c384aed4 100644 --- a/source/net/yacy/document/language/LanguageStatisticsHolder.java +++ b/source/net/yacy/document/language/LanguageStatisticsHolder.java @@ -4,9 +4,9 @@ // first published on http://www.yacy.net // Braunschweig, Germany, 2008 // -// $LastChangedDate: 2008-05-23 23:00:00 +0200 (Fr, 23 Mai 2008) $ -// $LastChangedRevision: 4824 $ -// $LastChangedBy: low012 $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by diff --git a/source/net/yacy/document/parser/html/AbstractScraper.java b/source/net/yacy/document/parser/html/AbstractScraper.java index cc55b0b6b..cf51c841d 100644 --- a/source/net/yacy/document/parser/html/AbstractScraper.java +++ b/source/net/yacy/document/parser/html/AbstractScraper.java @@ -29,8 +29,8 @@ package net.yacy.document.parser.html; -import java.util.HashSet; import java.util.Properties; +import java.util.Set; public abstract class AbstractScraper implements Scraper { @@ -38,15 +38,15 @@ public abstract class AbstractScraper implements Scraper { public static final char rb = '>'; public static final char sl = '/'; - private HashSet tags0; - private HashSet tags1; + private Set tags0; + private Set tags1; /** * create a scraper. the tag sets must contain tags in lowercase! * @param tags0 * @param tags1 */ - public AbstractScraper(final HashSet tags0, final HashSet tags1) { + public AbstractScraper(final Set tags0, final Set tags1) { this.tags0 = tags0; this.tags1 = tags1; } @@ -68,11 +68,9 @@ public abstract class AbstractScraper implements Scraper { public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text); protected static String stripAllTags(final char[] s) { - StringBuilder r = new StringBuilder(s.length); + final StringBuilder r = new StringBuilder(s.length); int bc = 0; - char c; - for (int p = 0; p < s.length; p++) { - c = s[p]; + for (final char c : s) { if (c == lb) { bc++; r.append(' '); diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 7ac57b21b..715d16552 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -4,8 +4,6 @@ // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // -// Contains contributions by Marc Nause [MN] -// // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ @@ -41,6 +39,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.Set; import javax.swing.event.EventListenerList; @@ -55,8 +54,8 @@ import net.yacy.kelondro.util.ISO639; public class ContentScraper extends AbstractScraper implements Scraper { // statics: for initialization of the HTMLFilterAbstractScraper - private static final HashSet linkTags0 = new HashSet(9,0.99f); - private static final HashSet linkTags1 = new HashSet(7,0.99f); + private static final Set linkTags0 = new HashSet(9,0.99f); + private static final Set linkTags1 = new HashSet(7,0.99f); // all these tags must be given in lowercase, because the tags from the files are compared in lowercase static { @@ -79,10 +78,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { } // class variables: collectors for links - private HashMap rss; - private HashMap anchors; - private HashMap images; // urlhash/image relation - private final HashMap metas; + private Map rss; + private Map anchors; + private Map images; // urlhash/image relation + private final Map metas; private String title; //private String headline; private List[] headlines; @@ -153,8 +152,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (b.length() != 0) content.append(b).append(32); } - private static final int find(final String s, final String m, int start) { - int p = s.indexOf(m, start); + private static final int find(final String s, final String m, final int start) { + final int p = s.indexOf(m, start); return (p < 0) ? Integer.MAX_VALUE : p; } @@ -185,14 +184,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { // addImage(images, ie); } } catch (final NumberFormatException e) {} - } - if (tagname.equalsIgnoreCase("base")) try { - root = new MultiProtocolURI(tagopts.getProperty("href", "")); - } catch (final MalformedURLException e) {} - if (tagname.equalsIgnoreCase("frame")) { + } else if(tagname.equalsIgnoreCase("base")) { + try { + root = new MultiProtocolURI(tagopts.getProperty("href", "")); + } catch (final MalformedURLException e) {} + } else if (tagname.equalsIgnoreCase("frame")) { anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name","")); - } - if (tagname.equalsIgnoreCase("meta")) { + } else if (tagname.equalsIgnoreCase("meta")) { String name = tagopts.getProperty("name", ""); if (name.length() > 0) { metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content",""))); @@ -202,14 +200,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { metas.put(name.toLowerCase(), CharacterCoding.html2unicode(tagopts.getProperty("content",""))); } } - } - if (tagname.equalsIgnoreCase("area")) { + } else if (tagname.equalsIgnoreCase("area")) { final String areatitle = cleanLine(tagopts.getProperty("title","")); //String alt = tagopts.getProperty("alt",""); final String href = tagopts.getProperty("href", ""); if (href.length() > 0) anchors.put(absolutePath(href), areatitle); - } - if (tagname.equalsIgnoreCase("link")) { + } else if (tagname.equalsIgnoreCase("link")) { final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", "")); if (newLink != null) { @@ -227,18 +223,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { anchors.put(newLink, linktitle); } } - } - //start contrib [MN] - if (tagname.equalsIgnoreCase("embed")) { + } else if(tagname.equalsIgnoreCase("embed")) { anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name","")); - } - if (tagname.equalsIgnoreCase("param")) { + } else if(tagname.equalsIgnoreCase("param")) { final String name = tagopts.getProperty("name", ""); if (name.equalsIgnoreCase("movie")) { anchors.put(absolutePath(tagopts.getProperty("value", "")),name); } } - //end contrib [MN] // fire event fireScrapeTag0(tagname, tagopts); @@ -262,24 +254,20 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } } - String h; + final String h; if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) { h = recursiveParse(text); if (h.length() > 0) headlines[0].add(h); - } - if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) { + } else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) { h = recursiveParse(text); if (h.length() > 0) headlines[1].add(h); - } - if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) { + } else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) { h = recursiveParse(text); if (h.length() > 0) headlines[2].add(h); - } - if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) { + } else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) { h = recursiveParse(text); if (h.length() > 0) headlines[3].add(h); - } - if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) { + } else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) { title = recursiveParse(text); } @@ -287,7 +275,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { fireScrapeTag1(tagname, tagopts, text); } - private String recursiveParse(char[] inlineHtml) { + private String recursiveParse(final char[] inlineHtml) { if (inlineHtml.length < 14) return cleanLine(super.stripAll(inlineHtml)); // start a new scraper to parse links inside this text @@ -307,11 +295,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { return cleanLine(super.stripAll(scraper.content.getChars())); } - private final static String cleanLine(String s) { - StringBuilder sb = new StringBuilder(s.length()); - char c, l = ' '; - for (int i = 0; i < s.length(); i++) { - c = s.charAt(i); + private final static String cleanLine(final String s) { + final StringBuilder sb = new StringBuilder(s.length()); + char l = ' '; + for (char c : s.toCharArray()) { if (c < ' ') c = ' '; if (c == ' ') { if (l != ' ') sb.append(c); @@ -358,9 +345,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { public String[] getHeadlines(final int i) { assert ((i >= 1) && (i <= 4)); - final String[] s = new String[headlines[i - 1].size()]; - for (int j = 0; j < headlines[i - 1].size(); j++) s[j] = headlines[i - 1].get(j); - return s; + return headlines[i - 1].toArray(new String[headlines.length]); } public byte[] getText() { @@ -389,7 +374,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { * get all images * @return a map of */ - public HashMap getImages() { + public Map getImages() { // this resturns a String(absolute url)/htmlFilterImageEntry - relation return images; } @@ -448,13 +433,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { return s; } - public HashSet getContentLanguages() { + public Set getContentLanguages() { // i.e. // or String s = metas.get("content-language"); if (s == null) s = metas.get("dc.language"); if (s == null) return null; - HashSet hs = new HashSet(); + Set hs = new HashSet(); String[] cl = s.split(" |,"); int p; for (int i = 0; i < cl.length; i++) { @@ -579,7 +564,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { return scraper; } - public static void addAllImages(final HashMap a, final HashMap b) { + public static void addAllImages(final Map a, final Map b) { final Iterator> i = b.entrySet().iterator(); Map.Entry ie; while (i.hasNext()) { @@ -588,7 +573,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } - public static void addImage(final HashMap a, final ImageEntry ie) { + public static void addImage(final Map a, final ImageEntry ie) { if (a.containsKey(ie.url())) { // in case of a collision, take that image that has the better image size tags if ((ie.height() > 0) && (ie.width() > 0)) a.put(ie.url(), ie); diff --git a/source/net/yacy/document/parser/images/bmpParser.java b/source/net/yacy/document/parser/images/bmpParser.java index b98d04cfb..57a9e63d2 100644 --- a/source/net/yacy/document/parser/images/bmpParser.java +++ b/source/net/yacy/document/parser/images/bmpParser.java @@ -2,9 +2,9 @@ // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 15.07.2007 on http://yacy.net // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index c4116d6d6..d41910cfb 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -4,9 +4,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate: 2009-10-11 02:12:19 +0200 (So, 11 Okt 2009) $ -// $LastChangedRevision: 6398 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/net/yacy/document/parser/images/icoParser.java b/source/net/yacy/document/parser/images/icoParser.java index 65bf7337d..27d7b79c1 100644 --- a/source/net/yacy/document/parser/images/icoParser.java +++ b/source/net/yacy/document/parser/images/icoParser.java @@ -2,9 +2,9 @@ // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 15.07.2007 on http://yacy.net // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE //