diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 59814fda3..6c702f241 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -70,8 +70,8 @@ import de.anomic.yacy.yacyURL; public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { // statics: for initialisation of the HTMLFilterAbstractScraper - private static TreeSet linkTags0; - private static TreeSet linkTags1; + private static TreeSet linkTags0; + private static TreeSet linkTags1; private static final Collator insensitiveCollator = Collator.getInstance(Locale.US); static { @@ -80,7 +80,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } static { - linkTags0 = new TreeSet(insensitiveCollator); + linkTags0 = new TreeSet(insensitiveCollator); linkTags0.add("img"); linkTags0.add("base"); linkTags0.add("frame"); @@ -90,7 +90,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen linkTags0.add("embed"); //added by [MN] linkTags0.add("param"); //added by [MN] - linkTags1 = new TreeSet(insensitiveCollator); + linkTags1 = new TreeSet(insensitiveCollator); linkTags1.add("a"); linkTags1.add("h1"); linkTags1.add("h2"); @@ -100,9 +100,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } // class variables: collectors for links - private HashMap anchors; - private TreeSet images; // String(absolute url)/ImageEntry relation - private HashMap metas; + private HashMap anchors; + private TreeSet images; // String(absolute url)/ImageEntry relation + private HashMap metas; private String title; //private String headline; private List[] headlines; @@ -124,12 +124,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen // it is only the reference for relative links super(linkTags0, linkTags1); this.root = root; - this.anchors = new HashMap(); - this.images = new TreeSet(); - this.metas = new HashMap(); + this.anchors = new HashMap(); + this.images = new TreeSet(); + this.metas = new HashMap(); this.title = ""; this.headlines = new ArrayList[4]; - for (int i = 0; i < 4; i++) headlines[i] = new ArrayList(); + for (int i = 0; i < 4; i++) headlines[i] = new ArrayList(); this.content = new serverCharBuffer(1024); } @@ -311,7 +311,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } } - // othervise take any headline + // otherwise take any headline for (int i = 0; i < 4; i++) { if (headlines[i].size() > 0) return (String) headlines[i].get(0); } @@ -346,17 +346,17 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } } - public Map getAnchors() { + public Map getAnchors() { // returns a url (String) / name (String) relation return anchors; } - public TreeSet getImages() { + public TreeSet getImages() { // this resturns a String(absolute url)/htmlFilterImageEntry - relation return images; } - public Map getMetas() { + public Map getMetas() { return metas; } diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java index 77bed5a05..747d2ad64 100644 --- a/source/de/anomic/plasma/parser/bzip/bzipParser.java +++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java @@ -63,7 +63,7 @@ public class bzipParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static String fileExtensions = "bz2,tbz,tbz2"; static { SUPPORTED_MIME_TYPES.put("application/x-bzip2",fileExtensions); diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index d25e4abcb..a3ef93d30 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -60,7 +60,7 @@ public class docParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/msword","doc"); } /** diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java index c057377f8..085e95598 100644 --- a/source/de/anomic/plasma/parser/gzip/gzipParser.java +++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java @@ -62,7 +62,7 @@ public class gzipParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/x-gzip","gz,tgz"); SUPPORTED_MIME_TYPES.put("application/gzip","gz,tgz"); diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java index 02fda681b..4b4417f24 100644 --- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java +++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java @@ -70,7 +70,7 @@ public class mimeTypeParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("text/xml","xml"); SUPPORTED_MIME_TYPES.put("application/xml","xml"); @@ -96,7 +96,7 @@ public class mimeTypeParser extends AbstractParser implements Parser { * Helping structure used to detect loops in the mimeType detection * process */ - private static Hashtable threadLoopDetection = new Hashtable(); + private static Hashtable threadLoopDetection = new Hashtable(); public mimeTypeParser() { super(LIBX_DEPENDENCIES); diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index 262e29760..ad8606df6 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -74,7 +74,7 @@ public class odtParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt"); SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt"); diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 3609f98c0..98c212e14 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -70,7 +70,7 @@ public class pdfParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/pdf","pdf"); } /** diff --git a/source/de/anomic/plasma/parser/ppt/pptParser.java b/source/de/anomic/plasma/parser/ppt/pptParser.java index bd276028f..1df0fd873 100644 --- a/source/de/anomic/plasma/parser/ppt/pptParser.java +++ b/source/de/anomic/plasma/parser/ppt/pptParser.java @@ -62,7 +62,7 @@ public class pptParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/mspowerpoint","ppt,pps"); SUPPORTED_MIME_TYPES.put("application/powerpoint","ppt,pps"); diff --git a/source/de/anomic/plasma/parser/ps/psParser.java b/source/de/anomic/plasma/parser/ps/psParser.java index 3d7900647..0ecbf3ce4 100644 --- a/source/de/anomic/plasma/parser/ps/psParser.java +++ b/source/de/anomic/plasma/parser/ps/psParser.java @@ -65,7 +65,7 @@ public class psParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/postscript","ps"); SUPPORTED_MIME_TYPES.put("text/postscript","ps"); diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java index 1296eb689..b33f65012 100644 --- a/source/de/anomic/plasma/parser/rpm/rpmParser.java +++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java @@ -70,7 +70,7 @@ public class rpmParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/x-rpm","rpm"); SUPPORTED_MIME_TYPES.put("application/x-redhat packet manager","rpm"); @@ -110,7 +110,7 @@ public class rpmParser extends AbstractParser implements Parser { RPMFile rpmFile = null; try { String summary = null, description = null, packager = null, name = sourceFile.getName(); - HashMap anchors = new HashMap(); + HashMap anchors = new HashMap(); StringBuffer content = new StringBuffer(); // opening the rpm file diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index 4ee94ea4f..c8595b3ca 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -73,7 +73,7 @@ public class rssParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("text/rss","xml,rss,rdf"); SUPPORTED_MIME_TYPES.put("application/rdf+xml","xml,rss,rdf"); @@ -95,9 +95,9 @@ public class rssParser extends AbstractParser implements Parser { public plasmaParserDocument parse(yacyURL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { try { - LinkedList feedSections = new LinkedList(); - HashMap anchors = new HashMap(); - TreeSet images = new TreeSet(); + LinkedList feedSections = new LinkedList(); + HashMap anchors = new HashMap(); + TreeSet images = new TreeSet(); serverByteBuffer text = new serverByteBuffer(); serverCharBuffer authors = new serverCharBuffer(); @@ -149,12 +149,12 @@ public class rssParser extends AbstractParser implements Parser { feedSections.add(itemHeadline); } - Map itemLinks = scraper.getAnchors(); + Map itemLinks = scraper.getAnchors(); if ((itemLinks != null) && (itemLinks.size() > 0)) { anchors.putAll(itemLinks); } - TreeSet itemImages = scraper.getImages(); + TreeSet itemImages = scraper.getImages(); if ((itemImages != null) && (itemImages.size() > 0)) { images.addAll(itemImages); } diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index 133bbe20b..ed8a1128f 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -61,7 +61,7 @@ public class rtfParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/rtf","rtf"); SUPPORTED_MIME_TYPES.put("text/rtf","rtf"); diff --git a/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java b/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java index cea2a6066..2a254a7fe 100644 --- a/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java +++ b/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java @@ -93,7 +93,7 @@ public class SZParserExtractCallback extends ArchiveExtractCallback { case IInArchive.NExtract_NAskMode_kSkip: this.log.logFine("Skipping " + this.filePath); break; - }; + } } public void SetOperationResult(int arg0) throws IOException { @@ -126,7 +126,7 @@ public class SZParserExtractCallback extends ArchiveExtractCallback { } // revert the above workaround - Map nanchors = new HashMap(theDoc.getAnchors().size(), 1f); + Map nanchors = new HashMap(theDoc.getAnchors().size(), 1f); Iterator it = theDoc.getAnchors().entrySet().iterator(); Map.Entry entry; String base = doc.getLocation().toNormalform(false, true); @@ -135,9 +135,9 @@ public class SZParserExtractCallback extends ArchiveExtractCallback { if (((String)entry.getKey()).startsWith(base + "/")) { String ref = "#" + ((String)entry.getKey()).substring(base.length() + 1); this.log.logFinest("changing " + entry.getKey() + " to use reference " + ref); - nanchors.put(base + ref, entry.getValue()); + nanchors.put(base + ref, (String)entry.getValue()); } else { - nanchors.put(entry.getKey(), entry.getValue()); + nanchors.put((String)entry.getKey(), (String)entry.getValue()); } } theDoc.getAnchors().clear(); diff --git a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java index 0c15fa30f..69d60e6da 100644 --- a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java +++ b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java @@ -65,7 +65,7 @@ public class sevenzipParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z"); } diff --git a/source/de/anomic/plasma/parser/swf/swfParser.java b/source/de/anomic/plasma/parser/swf/swfParser.java index 9e47ff7db..32645f946 100644 --- a/source/de/anomic/plasma/parser/swf/swfParser.java +++ b/source/de/anomic/plasma/parser/swf/swfParser.java @@ -61,7 +61,7 @@ public class swfParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash","swf"); SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash2-preview","swf"); @@ -101,7 +101,7 @@ public class swfParser extends AbstractParser implements Parser { String[] sections = null; String abstrct = null; //TreeSet images = null; - HashMap anchors = new HashMap(); + HashMap anchors = new HashMap(); int urls = 0; int urlStart = -1; int urlEnd = 0; diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index 40cdc345c..08df662a6 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -59,6 +59,7 @@ import java.util.zip.GZIPInputStream; import com.ice.tar.TarEntry; import com.ice.tar.TarInputStream; +import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.AbstractParser; @@ -74,7 +75,7 @@ public class tarParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/x-tar","tar"); SUPPORTED_MIME_TYPES.put("application/tar","tar"); @@ -127,11 +128,11 @@ public class tarParser extends AbstractParser implements Parser { StringBuffer docKeywords = new StringBuffer(); StringBuffer docLongTitle = new StringBuffer(); - LinkedList docSections = new LinkedList(); + LinkedList docSections = new LinkedList(); StringBuffer docAbstrct = new StringBuffer(); - Map docAnchors = new HashMap(); - TreeSet docImages = new TreeSet(); + Map docAnchors = new HashMap(); + TreeSet docImages = new TreeSet(); // looping through the contained files TarEntry entry; diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index 53f8ba869..e14004c52 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -76,7 +76,7 @@ public class vcfParser extends AbstractParser implements Parser { * * TODO: support of x-mozilla-cpt and x-mozilla-html tags */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("text/x-vcard","vcf"); SUPPORTED_MIME_TYPES.put("application/vcard","vcf"); @@ -102,9 +102,9 @@ public class vcfParser extends AbstractParser implements Parser { try { StringBuffer parsedTitle = new StringBuffer(); StringBuffer parsedDataText = new StringBuffer(); - HashMap parsedData = new HashMap(); - HashMap anchors = new HashMap(); - LinkedList parsedNames = new LinkedList(); + HashMap parsedData = new HashMap(); + HashMap anchors = new HashMap(); + LinkedList parsedNames = new LinkedList(); boolean useLastLine = false; int lineNr = 0; diff --git a/source/de/anomic/plasma/parser/xls/xlsParser.java b/source/de/anomic/plasma/parser/xls/xlsParser.java index 17780429f..010b1d2bd 100644 --- a/source/de/anomic/plasma/parser/xls/xlsParser.java +++ b/source/de/anomic/plasma/parser/xls/xlsParser.java @@ -74,7 +74,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/msexcel","xls"); SUPPORTED_MIME_TYPES.put("application/excel","xls"); diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index 8cbf1678f..d3fcfb798 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -43,6 +43,7 @@ package de.anomic.plasma.parser.zip; +import de.anomic.htmlFilter.htmlFilterImageEntry; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; @@ -72,7 +73,7 @@ public class zipParser extends AbstractParser implements Parser { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); static { SUPPORTED_MIME_TYPES.put("application/zip","zip"); SUPPORTED_MIME_TYPES.put("application/x-zip","zip"); @@ -91,7 +92,7 @@ public class zipParser extends AbstractParser implements Parser { this.parserName = "Compressed Archive File Parser"; } - public Hashtable getSupportedMimeTypes() { + public Hashtable getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } @@ -111,10 +112,10 @@ public class zipParser extends AbstractParser implements Parser { StringBuffer docKeywords = new StringBuffer(); StringBuffer docLongTitle = new StringBuffer(); - LinkedList docSections = new LinkedList(); + LinkedList docSections = new LinkedList(); StringBuffer docAbstrct = new StringBuffer(); - Map docAnchors = new HashMap(); - TreeSet docImages = new TreeSet(); + Map docAnchors = new HashMap(); + TreeSet docImages = new TreeSet(); // creating a new parser class to parse the unzipped content plasmaParser theParser = new plasmaParser(); diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 8aa48d51d..05e58d872 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -66,7 +66,7 @@ public final class plasmaParser { public static final String PARSER_MODE_URLREDIRECTOR = "URLREDIRECTOR"; public static final String PARSER_MODE_ICAP = "ICAP"; public static final String PARSER_MODE_IMAGE = "IMAGE"; - public static final HashSet PARSER_MODE = new HashSet(Arrays.asList(new String[]{ + public static final HashSet PARSER_MODE = new HashSet(Arrays.asList(new String[]{ PARSER_MODE_PROXY, PARSER_MODE_CRAWLER, PARSER_MODE_ICAP, @@ -74,7 +74,7 @@ public final class plasmaParser { PARSER_MODE_IMAGE })); - private static final HashMap parserConfigList = new HashMap(); + private static final HashMap parserConfigList = new HashMap(); /** * A list containing all installed parsers and the mimeType that they support @@ -85,8 +85,8 @@ public final class plasmaParser { /** * A list of file extensions and mime types that are supported by the html-parser */ - public static final HashSet supportedHTMLFileExt = new HashSet(); - public static final HashSet supportedHTMLMimeTypes = new HashSet(); + public static final HashSet supportedHTMLFileExt = new HashSet(); + public static final HashSet supportedHTMLMimeTypes = new HashSet(); private static final Properties mimeTypeLookupByFileExt = new Properties(); static { @@ -104,15 +104,15 @@ public final class plasmaParser { /** * A list of media extensions that should not be handled by the plasmaParser */ - private static final HashSet mediaExtSet = new HashSet(); + private static final HashSet mediaExtSet = new HashSet(); /** * A list of image, audio, video and application extensions */ - private static final HashSet imageExtSet = new HashSet(); - private static final HashSet audioExtSet = new HashSet(); - private static final HashSet videoExtSet = new HashSet(); - private static final HashSet appsExtSet = new HashSet(); + private static final HashSet imageExtSet = new HashSet(); + private static final HashSet audioExtSet = new HashSet(); + private static final HashSet videoExtSet = new HashSet(); + private static final HashSet appsExtSet = new HashSet(); /** * This {@link FilenameFilter} is used to find all classes based on there filenames @@ -181,7 +181,7 @@ public final class plasmaParser { * yacy html parser */ public static void initHTMLParsableMimeTypes(String htmlParsableMimeTypes) { - LinkedList mimeTypes = new LinkedList(); + LinkedList mimeTypes = new LinkedList(); if ((htmlParsableMimeTypes == null) || (htmlParsableMimeTypes.length() == 0)) { return; } @@ -195,8 +195,8 @@ public final class plasmaParser { } } - public static List extString2extList(String extString) { - LinkedList extensions = new LinkedList(); + public static List extString2extList(String extString) { + LinkedList extensions = new LinkedList(); if ((extString == null) || (extString.length() == 0)) { return extensions; } else { @@ -206,35 +206,35 @@ public final class plasmaParser { return extensions; } - public static void initMediaExt(List mediaExtList) { + public static void initMediaExt(List mediaExtList) { synchronized (mediaExtSet) { mediaExtSet.clear(); mediaExtSet.addAll(mediaExtList); } } - public static void initImageExt(List imageExtList) { + public static void initImageExt(List imageExtList) { synchronized (imageExtSet) { imageExtSet.clear(); imageExtSet.addAll(imageExtList); } } - public static void initAudioExt(List audioExtList) { + public static void initAudioExt(List audioExtList) { synchronized (audioExtSet) { audioExtSet.clear(); audioExtSet.addAll(audioExtList); } } - public static void initVideoExt(List videoExtList) { + public static void initVideoExt(List videoExtList) { synchronized (videoExtSet) { videoExtSet.clear(); videoExtSet.addAll(videoExtList); } } - public static void initAppsExt(List appsExtList) { + public static void initAppsExt(List appsExtList) { synchronized (appsExtSet) { appsExtSet.clear(); appsExtSet.addAll(appsExtList); @@ -247,7 +247,7 @@ public final class plasmaParser { } } - public static void initSupportedHTMLFileExt(List supportedRealtimeFileExtList) { + public static void initSupportedHTMLFileExt(List supportedRealtimeFileExtList) { synchronized (supportedHTMLFileExt) { supportedHTMLFileExt.clear(); supportedHTMLFileExt.addAll(supportedRealtimeFileExtList); @@ -750,10 +750,10 @@ public final class plasmaParser { } - static Map allReflinks(Set links) { + static Map allReflinks(Set links) { // links is either a Set of Strings (with urls) or htmlFilterImageEntries // we find all links that are part of a reference inside a url - HashMap v = new HashMap(); + HashMap v = new HashMap(); Iterator i = links.iterator(); Object o; String url; @@ -784,9 +784,9 @@ public final class plasmaParser { return v; } - static Map allSubpaths(Set links) { + static Map allSubpaths(Set links) { // links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries - HashMap v = new HashMap(); + HashMap v = new HashMap(); Iterator i = links.iterator(); Object o; String url; diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index e709cb768..ca1360410 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -68,21 +68,21 @@ import de.anomic.plasma.parser.Parser; public class plasmaParserDocument { private yacyURL location; // the source url - private String mimeType; // mimeType as taken from http header - private String charset; // the charset of the document - private List keywords; // most resources provide a keyword field - private StringBuffer title; // a document title, taken from title or h1 tag; shall appear as headline of search result - private StringBuffer author; // author or copyright - private List sections; // if present: more titles/headlines appearing in the document - private StringBuffer abstrct; // an abstract, if present: short content description - private Object text; // the clear text, all that is visible - private Map anchors; // all links embedded as clickeable entities (anchor tags) - private TreeSet images; // all visible pictures in document + private String mimeType; // mimeType as taken from http header + private String charset; // the charset of the document + private List keywords; // most resources provide a keyword field + private StringBuffer title; // a document title, taken from title or h1 tag; shall appear as headline of search result + private StringBuffer author; // author or copyright + private List sections; // if present: more titles/headlines appearing in the document + private StringBuffer abstrct; // an abstract, if present: short content description + private Object text; // the clear text, all that is visible + private Map anchors; // all links embedded as clickeable entities (anchor tags) + private TreeSet images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative // text in image tags. - private Map hyperlinks, audiolinks, videolinks, applinks; - private Map emaillinks; + private Map hyperlinks, audiolinks, videolinks, applinks; + private Map emaillinks; private yacyURL favicon; private boolean resorted; private InputStream textStream; @@ -90,17 +90,17 @@ public class plasmaParserDocument { protected plasmaParserDocument(yacyURL location, String mimeType, String charset, String[] keywords, String title, String author, String[] sections, String abstrct, - Object text, Map anchors, TreeSet images) { + Object text, Map anchors, TreeSet images) { this.location = location; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; this.charset = charset; - this.keywords = (keywords == null) ? new LinkedList() : Arrays.asList(keywords); + this.keywords = (keywords == null) ? new LinkedList() : Arrays.asList(keywords); this.title = (title == null) ? new StringBuffer() : new StringBuffer(title); this.author = (author == null) ? new StringBuffer() : new StringBuffer(author); - this.sections = (sections == null) ? new LinkedList() : Arrays.asList(sections); + this.sections = (sections == null) ? new LinkedList() : Arrays.asList(sections); this.abstrct = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct); - this.anchors = (anchors == null) ? new HashMap(0) : anchors; - this.images = (images == null) ? new TreeSet() : images; + this.anchors = (anchors == null) ? new HashMap(0) : anchors; + this.images = (images == null) ? new TreeSet() : images; this.hyperlinks = null; this.audiolinks = null; this.videolinks = null; @@ -125,21 +125,21 @@ public class plasmaParserDocument { public plasmaParserDocument(yacyURL location, String mimeType, String charset, String[] keywords, String title, String author, String[] sections, String abstrct, - byte[] text, Map anchors, TreeSet images) { + byte[] text, Map anchors, TreeSet images) { this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); } public plasmaParserDocument(yacyURL location, String mimeType, String charset, String[] keywords, String title, String author, String[] sections, String abstrct, - File text, Map anchors, TreeSet images) { + File text, Map anchors, TreeSet images) { this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); } public plasmaParserDocument(yacyURL location, String mimeType, String charset, String[] keywords, String title, String author, String[] sections, String abstrct, - serverCachedFileOutputStream text, Map anchors, TreeSet images) { + serverCachedFileOutputStream text, Map anchors, TreeSet images) { this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images); } @@ -238,7 +238,7 @@ public class plasmaParserDocument { public String getKeywords(char separator) { // sort out doubles and empty words - TreeSet hs = new TreeSet(); + TreeSet hs = new TreeSet(); String s; for (int i = 0; i < this.keywords.size(); i++) { if (this.keywords.get(i) == null) continue; @@ -253,11 +253,11 @@ public class plasmaParserDocument { return sb.substring(0, sb.length() - 1); } - public List getKeywords() { + public List getKeywords() { return this.keywords; } - public Map getAnchors() { + public Map getAnchors() { // returns all links embedded as anchors (clickeable entities) // this is a url(String)/text(String) map return anchors; @@ -266,35 +266,35 @@ public class plasmaParserDocument { // the next three methods provide a calculated view on the getAnchors/getImages: - public Map getHyperlinks() { + public Map getHyperlinks() { // this is a subset of the getAnchor-set: only links to other hyperrefs if (!resorted) resortLinks(); return hyperlinks; } - public Map getAudiolinks() { + public Map getAudiolinks() { if (!resorted) resortLinks(); return this.audiolinks; } - public Map getVideolinks() { + public Map getVideolinks() { if (!resorted) resortLinks(); return this.videolinks; } - public TreeSet getImages() { + public TreeSet getImages() { // returns all links enbedded as pictures (visible in document) // this resturns a htmlFilterImageEntry collection if (!resorted) resortLinks(); return images; } - public Map getApplinks() { + public Map getApplinks() { if (!resorted) resortLinks(); return this.applinks; } - public Map getEmaillinks() { + public Map getEmaillinks() { // this is part of the getAnchor-set: only links to email addresses if (!resorted) resortLinks(); return emaillinks; @@ -309,18 +309,18 @@ public class plasmaParserDocument { int extpos, qpos; String ext = null; i = anchors.entrySet().iterator(); - hyperlinks = new HashMap(); - videolinks = new HashMap(); - audiolinks = new HashMap(); - applinks = new HashMap(); - emaillinks = new HashMap(); - TreeSet collectedImages = new TreeSet(); // this is a set that is collected now and joined later to the imagelinks + hyperlinks = new HashMap(); + videolinks = new HashMap(); + audiolinks = new HashMap(); + applinks = new HashMap(); + emaillinks = new HashMap(); + TreeSet collectedImages = new TreeSet(); // this is a set that is collected now and joined later to the imagelinks Map.Entry entry; while (i.hasNext()) { entry = (Map.Entry) i.next(); u = (String) entry.getKey(); if ((u != null) && (u.startsWith("mailto:"))) { - emaillinks.put(u.substring(7), entry.getValue()); + emaillinks.put(u.substring(7), (String)entry.getValue()); } else { extpos = u.lastIndexOf("."); if (extpos > 0) { @@ -337,11 +337,11 @@ public class plasmaParserDocument { if (plasmaParser.imageExtContains(ext)) { collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1)); } - else if (plasmaParser.audioExtContains(ext)) audiolinks.put(u, entry.getValue()); - else if (plasmaParser.videoExtContains(ext)) videolinks.put(u, entry.getValue()); - else if (plasmaParser.appsExtContains(ext)) applinks.put(u, entry.getValue()); + else if (plasmaParser.audioExtContains(ext)) audiolinks.put(u, (String)entry.getValue()); + else if (plasmaParser.videoExtContains(ext)) videolinks.put(u, (String)entry.getValue()); + else if (plasmaParser.appsExtContains(ext)) applinks.put(u, (String)entry.getValue()); } else { - hyperlinks.put(u, entry.getValue()); + hyperlinks.put(u, (String)entry.getValue()); } } catch (MalformedURLException e1) { } @@ -356,11 +356,11 @@ public class plasmaParserDocument { iEntry = (htmlFilterImageEntry) i.next(); if (!images.contains(iEntry)) images.add(iEntry); } - + // expand the hyperlinks: // we add artificial hyperlinks to the hyperlink set // that can be calculated from given hyperlinks and imagelinks - hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks.keySet())); + hyperlinks.putAll(plasmaParser.allReflinks(images)); hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet())); hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet()));