From c0274bd1233871c987a544c6fd4c5346141c665c Mon Sep 17 00:00:00 2001 From: low012 Date: Mon, 27 Dec 2010 15:37:11 +0000 Subject: [PATCH] *) minor changes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7394 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../net/yacy/document/parser/bzipParser.java | 6 +- .../net/yacy/document/parser/csvParser.java | 49 ++++++++++------- .../net/yacy/document/parser/docParser.java | 30 +++++----- .../yacy/document/parser/genericParser.java | 14 ++++- .../net/yacy/document/parser/htmlParser.java | 20 ++++--- .../net/yacy/document/parser/odtParser.java | 6 +- .../net/yacy/document/parser/ooxmlParser.java | 10 ++-- .../net/yacy/document/parser/pdfParser.java | 14 +++-- source/net/yacy/document/parser/psParser.java | 50 +++++++++-------- .../net/yacy/document/parser/rssParser.java | 16 ++++-- .../net/yacy/document/parser/rtfParser.java | 20 +++---- .../yacy/document/parser/sevenzipParser.java | 2 +- .../yacy/document/parser/sitemapParser.java | 55 ++++++++++++------- .../net/yacy/document/parser/swfParser.java | 3 +- .../net/yacy/document/parser/tarParser.java | 8 ++- .../yacy/document/parser/torrentParser.java | 39 +++++++------ .../net/yacy/document/parser/vcfParser.java | 45 +++++++-------- .../net/yacy/document/parser/vsdParser.java | 11 ++-- .../net/yacy/document/parser/zipParser.java | 14 +++-- 19 files changed, 245 insertions(+), 167 deletions(-) diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index 271f80691..7e010ff33 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -41,7 +41,7 @@ import net.yacy.kelondro.util.FileUtils; import org.apache.tools.bzip2.CBZip2InputStream; -public class bzipParser extends AbstractParser implements Parser { +public class bzipParser extends AbstractParser implements Parser { public bzipParser() { super("Bzip 2 UNIX Compressed File Parser"); @@ -55,7 +55,9 @@ public class bzipParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.add("application/x-stuffit"); } - public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final MultiProtocolURI location, final String mimeType, + final String charset, final InputStream source) + throws Parser.Failure, InterruptedException { File tempFile = null; Document[] docs; diff --git a/source/net/yacy/document/parser/csvParser.java b/source/net/yacy/document/parser/csvParser.java index 521243e63..03c6f07b4 100644 --- a/source/net/yacy/document/parser/csvParser.java +++ b/source/net/yacy/document/parser/csvParser.java @@ -3,6 +3,10 @@ * Copyright 2009 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany * First released 02.10.2009 at http://yacy.net * +// $LastChangedDate $ +// $LastChangedRevision $ +// $LastChangedBy $ + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either @@ -49,10 +53,12 @@ public class csvParser extends AbstractParser implements Parser { // construct a document using all cells of the document // the first row is used as headline // all lines are artificially terminated by a '.' to separate them as sentence for the condenser. - List table = getTable(location, mimeType, charset, source); + final List table = getTable(location, mimeType, charset, source); if (table.isEmpty()) throw new Parser.Failure("document has no lines", location); - StringBuilder sb = new StringBuilder(); - for (String[] row: table) sb.append(concatRow(row)).append(' '); + final StringBuilder sb = new StringBuilder(); + for (final String[] row: table) { + sb.append(concatRow(row)).append(' '); + } try { return new Document[]{new Document( location, @@ -75,18 +81,18 @@ public class csvParser extends AbstractParser implements Parser { } } - public String concatRow(String[] column) { - StringBuilder sb = new StringBuilder(80); - for (int i = 0; i < column.length; i++) { - if (i != 0) sb.append(' '); - sb.append(column[i]); + private String concatRow(String[] columns) { + final StringBuilder sb = new StringBuilder(80); + for (final String column : columns) { + if (sb.length() > 0) sb.append(' '); + sb.append(column); } sb.append('.'); return sb.toString(); } - public List getTable(MultiProtocolURI location, String mimeType, String charset, InputStream source) { - ArrayList rows = new ArrayList(); + private List getTable(MultiProtocolURI location, String mimeType, String charset, InputStream source) { + final List rows = new ArrayList(); BufferedReader reader; try { reader = new BufferedReader(new InputStreamReader(source, charset)); @@ -102,16 +108,16 @@ public class csvParser extends AbstractParser implements Parser { if (row.length() == 0) continue; if (separator == null) { // try comma, semicolon and tab; take that one that results with more columns - String[] colc = row.split(","); - String[] cols = row.split(";"); - String[] colt = row.split("\t"); + final String[] colc = row.split(","); + final String[] cols = row.split(";"); + final String[] colt = row.split("\t"); if (colc.length >= cols.length && colc.length >= colt.length) separator = ","; if (cols.length >= colc.length && cols.length >= colt.length) separator = ";"; if (colt.length >= cols.length && colt.length >= colc.length) separator = "\t"; } row = stripQuotes(row, '\"', separator.charAt(0), ' '); row = stripQuotes(row, '\'', separator.charAt(0), ' '); - String[] cols = row.split(separator); + final String[] cols = row.split(separator); if (columns == -1) columns = cols.length; //if (cols.length != columns) continue; // skip lines that have the wrong number of columns rows.add(cols); @@ -130,19 +136,22 @@ public class csvParser extends AbstractParser implements Parser { * @param replacement * @return the line without the quotes */ - public static String stripQuotes(String line, char quote, char separator, char replacement) { + private static String stripQuotes(final String line, final char quote, + final char separator, final char replacement) { + String ret = line; + int p, q; // find left quote - while ((p = line.indexOf(quote)) >= 0) { - q = line.indexOf(quote, p + 1); + while ((p = ret.indexOf(quote)) >= 0) { + q = ret.indexOf(quote, p + 1); if (q < 0) { // there is only a single quote but no 'right' quote. // This data is not well-formed. Just remove the quote and give up. - return line.substring(0, p) + line.substring(p + 1); + return ret.substring(0, p) + ret.substring(p + 1); } - line = line.substring(0, p) + line.substring(p + 1, q).replace(separator, replacement) + line.substring(q + 1); + ret = ret.substring(0, p) + ret.substring(p + 1, q).replace(separator, replacement) + ret.substring(q + 1); } - return line; + return ret; } } diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 49fcd55e3..3efe68f60 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -38,8 +38,8 @@ import org.apache.poi.hwpf.extractor.WordExtractor; public class docParser extends AbstractParser implements Parser { - public docParser() { - super("Word Document Parser"); + public docParser() { + super("Word Document Parser"); SUPPORTED_EXTENSIONS.add("doc"); SUPPORTED_MIME_TYPES.add("application/msword"); SUPPORTED_MIME_TYPES.add("application/doc"); @@ -50,9 +50,11 @@ public class docParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.add("application/word"); SUPPORTED_MIME_TYPES.add("application/x-msw6"); SUPPORTED_MIME_TYPES.add("application/x-msword"); - } + } - public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final MultiProtocolURI location, final String mimeType, + final String charset, final InputStream source) + throws Parser.Failure, InterruptedException { final WordExtractor extractor; @@ -62,7 +64,7 @@ public class docParser extends AbstractParser implements Parser { throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location); } - StringBuilder contents = new StringBuilder(); + final StringBuilder contents = new StringBuilder(); try { contents.append(extractor.getText().trim()); contents.append(" "); @@ -72,15 +74,15 @@ public class docParser extends AbstractParser implements Parser { } catch (Exception e) { throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location); } - String title = (contents.length() > 240) ? contents.substring(0,240) : contents.toString().trim(); + String title = (contents.length() > 240) ? contents.substring(0,240) : contents.toString().trim(); title.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim(); - if (title.length() > 80) title = title.substring(0, 80); - int l = title.length(); - while (true) { - title = title.replaceAll(" ", " "); - if (title.length() == l) break; - l = title.length(); - } + if (title.length() > 80) title = title.substring(0, 80); + int l = title.length(); + while (true) { + title = title.replaceAll(" ", " "); + if (title.length() == l) break; + l = title.length(); + } Document[] docs; try { @@ -105,6 +107,6 @@ public class docParser extends AbstractParser implements Parser { } return docs; - } + } } diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index 7559d7c99..01df6a3d7 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ b/source/net/yacy/document/parser/genericParser.java @@ -3,6 +3,10 @@ * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany * First released 30.11.2010 at http://yacy.net * +// $LastChangedDate $ +// $LastChangedRevision $ +// $LastChangedBy $ + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either @@ -38,9 +42,11 @@ public class genericParser extends AbstractParser implements Parser { // this parser is used if no other fits. This parser fits all } - public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source1) throws Parser.Failure, InterruptedException { + public Document[] parse(final MultiProtocolURI location, final String mimeType, + final String charset, final InputStream source1) + throws Parser.Failure, InterruptedException { - Document[] docs = new Document[]{new Document( + final Document[] docs = new Document[]{new Document( location, mimeType, charset, @@ -56,7 +62,9 @@ public class genericParser extends AbstractParser implements Parser { null, null, false)}; - for (Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs + for (final Document d: docs) { + assert d.getText() != null : "mimeType = " + mimeType; + } // verify docs return docs; } } diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index c04768798..7d1b86372 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -3,6 +3,10 @@ * Copyright 2009 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany * First released 09.07.2009 at http://yacy.net * +// $LastChangedDate $ +// $LastChangedRevision $ +// $LastChangedBy $ + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either @@ -159,7 +163,11 @@ public class htmlParser extends AbstractParser implements Parser { private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) { final String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length]; int p = 0; - for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j]; + for (int i = 1; i <= 4; i++) { + for (final String headline : scraper.getHeadlines(i)) { + sections[p++] = headline; + } + } final Document[] ppds = new Document[]{new Document( location, mimeType, @@ -177,7 +185,9 @@ public class htmlParser extends AbstractParser implements Parser { scraper.getImages(), scraper.indexingDenied())}; //scraper.close(); - for (Document ppd: ppds) ppd.setFavicon(scraper.getFavicon()); + for (final Document ppd: ppds) { + ppd.setFavicon(scraper.getFavicon()); + } return ppds; } @@ -256,11 +266,7 @@ public class htmlParser extends AbstractParser implements Parser { return encoding; } - - public boolean indexingDenied() { - return false; - } - + public static void main(String[] args) { // test parsing of a url MultiProtocolURI url; diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index 8f77a6564..b0e21f548 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -84,7 +84,9 @@ public class odtParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.add("application/OOo-writer"); } - private Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws Parser.Failure, InterruptedException { + private Document[] parse(final MultiProtocolURI location, final String mimeType, + final String charset, final File dest) + throws Parser.Failure, InterruptedException { CharBuffer writer = null; try { @@ -138,7 +140,7 @@ public class odtParser extends AbstractParser implements Parser { } // make the languages set - Set languages = new HashSet(1); + final Set languages = new HashSet(1); if (docLanguage != null) languages.add(docLanguage); // if there is no title availabe we generate one diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 04a0f1a3f..667ef9c1d 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -126,15 +126,13 @@ public class ooxmlParser extends AbstractParser implements Parser { } // make the languages set - Set languages = new HashSet(1); + final Set languages = new HashSet(1); if (docLanguage != null && docLanguage.length() == 0) languages.add(docLanguage); // if there is no title availabe we generate one - if (docLongTitle == null || docLongTitle.length() == 0) { - if (docShortTitle != null) { + if ((docLongTitle == null || docLongTitle.length() == 0) && (docShortTitle != null)) { docLongTitle = docShortTitle; - } } // split the keywords @@ -166,7 +164,9 @@ public class ooxmlParser extends AbstractParser implements Parser { if (e instanceof Parser.Failure) throw (Parser.Failure) e; // close the writer - if (writer != null) try { writer.close(); } catch (final Exception ex) {/* ignore this */} + if (writer != null) try { + writer.close(); + } catch (final Exception ex) {/* ignore this */} Log.logException(e); throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location); diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 250bc9cf7..7821eed67 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -53,7 +53,7 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; -public class pdfParser extends AbstractParser implements Parser { +public class pdfParser extends AbstractParser implements Parser { public pdfParser() { super("Acrobat Portable Document Parser"); @@ -118,7 +118,9 @@ public class pdfParser extends AbstractParser implements Parser { // info.getModificationDate(); } - if (docTitle == null || docTitle.length() == 0) docTitle = MultiProtocolURI.unescape(location.getFileName()); + if (docTitle == null || docTitle.length() == 0) { + docTitle = MultiProtocolURI.unescape(location.getFileName()); + } CharBuffer writer = null; try { // create a writer for output @@ -139,8 +141,12 @@ public class pdfParser extends AbstractParser implements Parser { pdfDoc = null; String[] docKeywords = null; - if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,"); - if (docTitle == null) docTitle = docSubject; + if (docKeywordStr != null) { + docKeywords = docKeywordStr.split(" |,"); + } + if (docTitle == null) { + docTitle = docSubject; + } byte[] contentBytes; try { diff --git a/source/net/yacy/document/parser/psParser.java b/source/net/yacy/document/parser/psParser.java index b881944d4..88e8cf3a0 100644 --- a/source/net/yacy/document/parser/psParser.java +++ b/source/net/yacy/document/parser/psParser.java @@ -63,7 +63,7 @@ public class psParser extends AbstractParser implements Parser { } } - public boolean testForPs2Ascii() { + private boolean testForPs2Ascii() { try { String procOutputLine = null; final StringBuilder procOutput = new StringBuilder(); @@ -83,7 +83,7 @@ public class psParser extends AbstractParser implements Parser { } - public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final File sourceFile) throws Parser.Failure, InterruptedException { + private Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final File sourceFile) throws Parser.Failure, InterruptedException { File outputFile = null; try { @@ -128,7 +128,7 @@ public class psParser extends AbstractParser implements Parser { } } - public void parseUsingJava(final File inputFile, final File outputFile) throws Exception { + private void parseUsingJava(final File inputFile, final File outputFile) throws Exception { BufferedReader reader = null; BufferedWriter writer = null; @@ -166,7 +166,7 @@ public class psParser extends AbstractParser implements Parser { } } - } else if (version.length() > 0 && version.charAt(0) == '3') { + } else if (version.length() > 0 && version.charAt(0) == '3') { final StringBuilder stmt = new StringBuilder(); boolean isBMP = false; boolean isStore = false; @@ -226,32 +226,34 @@ public class psParser extends AbstractParser implements Parser { int execCode = 0; StringBuilder procErr = null; try { - String procOutputLine = null; - final StringBuilder procOut = new StringBuilder(); - procErr = new StringBuilder(); - - final Process ps2asciiProc = Runtime.getRuntime().exec(new String[]{"ps2ascii", inputFile.getAbsolutePath(),outputFile.getAbsolutePath()}); - final BufferedReader stdOut = new BufferedReader(new InputStreamReader(ps2asciiProc.getInputStream())); - final BufferedReader stdErr = new BufferedReader(new InputStreamReader(ps2asciiProc.getErrorStream())); - while ((procOutputLine = stdOut.readLine()) != null) { - procOut.append(procOutputLine); - } - stdOut.close(); - while ((procOutputLine = stdErr.readLine()) != null) { - procErr.append(procOutputLine); - } - stdErr.close(); - execCode = ps2asciiProc.waitFor(); + String procOutputLine; + final StringBuilder procOut = new StringBuilder(); + procErr = new StringBuilder(); + + final Process ps2asciiProc = Runtime.getRuntime().exec(new String[]{"ps2ascii", inputFile.getAbsolutePath(),outputFile.getAbsolutePath()}); + final BufferedReader stdOut = new BufferedReader(new InputStreamReader(ps2asciiProc.getInputStream())); + final BufferedReader stdErr = new BufferedReader(new InputStreamReader(ps2asciiProc.getErrorStream())); + while ((procOutputLine = stdOut.readLine()) != null) { + procOut.append(procOutputLine); + } + stdOut.close(); + while ((procOutputLine = stdErr.readLine()) != null) { + procErr.append(procOutputLine); + } + stdErr.close(); + execCode = ps2asciiProc.waitFor(); } catch (final Exception e) { - final String errorMsg = "Unable to convert ps to ascii. " + e.getMessage(); - this.log.logSevere(errorMsg); - throw new Exception(errorMsg); + final String errorMsg = "Unable to convert ps to ascii. " + e.getMessage(); + this.log.logSevere(errorMsg); + throw new Exception(errorMsg); } if (execCode != 0) throw new Exception("Unable to convert ps to ascii. ps2ascii returned statuscode " + execCode + "\n" + procErr.toString()); } - public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final MultiProtocolURI location, final String mimeType, + final String charset, final InputStream source) + throws Parser.Failure, InterruptedException { File tempFile = null; try { diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index 72c1841f8..0c51d5cb1 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -3,6 +3,10 @@ * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany * First released 20.08.2010 at http://yacy.net * +// $LastChangedDate $ +// $LastChangedRevision $ +// $LastChangedBy $ + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either @@ -53,7 +57,9 @@ public class rssParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.add("application/atom+xml"); } - public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { + public Document[] parse(final MultiProtocolURI url, final String mimeType, + final String charset, final InputStream source) + throws Failure, InterruptedException { RSSReader rssReader; try { rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source, RSSReader.Type.none); @@ -61,14 +67,14 @@ public class rssParser extends AbstractParser implements Parser { throw new Parser.Failure("Load error:" + e.getMessage(), url, e); } - RSSFeed feed = rssReader.getFeed(); + final RSSFeed feed = rssReader.getFeed(); //RSSMessage channel = feed.getChannel(); - List docs = new ArrayList(); + final List docs = new ArrayList(); MultiProtocolURI uri; Set languages; Map anchors; Document doc; - for (Hit item: feed) try { + for (final Hit item: feed) try { uri = new MultiProtocolURI(item.getLink()); languages = new HashSet(); languages.add(item.getLanguage()); @@ -95,7 +101,7 @@ public class rssParser extends AbstractParser implements Parser { continue; } - Document[] da = new Document[docs.size()]; + final Document[] da = new Document[docs.size()]; docs.toArray(da); return da; } diff --git a/source/net/yacy/document/parser/rtfParser.java b/source/net/yacy/document/parser/rtfParser.java index e1924bd49..846e2c06e 100644 --- a/source/net/yacy/document/parser/rtfParser.java +++ b/source/net/yacy/document/parser/rtfParser.java @@ -40,20 +40,21 @@ import net.yacy.document.Parser; public class rtfParser extends AbstractParser implements Parser { - public rtfParser() { - super("Rich Text Format Parser"); + public rtfParser() { + super("Rich Text Format Parser"); SUPPORTED_EXTENSIONS.add("rtf"); SUPPORTED_MIME_TYPES.add("text/rtf"); SUPPORTED_MIME_TYPES.add("text/richtext"); SUPPORTED_MIME_TYPES.add("application/rtf"); SUPPORTED_MIME_TYPES.add("application/x-rtf"); SUPPORTED_MIME_TYPES.add("application/x-soffice"); - } + } - public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final MultiProtocolURI location, final String mimeType, + final String charset, final InputStream source) + throws Parser.Failure, InterruptedException { - - try { + try { final DefaultStyledDocument doc = new DefaultStyledDocument(); final RTFEditorKit theRtfEditorKit = new RTFEditorKit(); @@ -81,13 +82,12 @@ public class rtfParser extends AbstractParser implements Parser { null, null, false)}; - } - catch (final Exception e) { + } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; throw new Parser.Failure("Unexpected error while parsing rtf resource." + e.getMessage(),location); - } - } + } + } } diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index f22ddc8b2..bca71dce9 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -96,7 +96,7 @@ public class sevenzipParser extends AbstractParser implements Parser { // wrapper class to redirect output of standard ArchiveExtractCallback to serverLog // and parse the extracted content - public static class SZParserExtractCallback extends ArchiveExtractCallback { + private static class SZParserExtractCallback extends ArchiveExtractCallback { private final Log log; private ByteArrayOutputStream cfos = null; diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index a644509ee..04d34b075 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -3,6 +3,10 @@ * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany * First released 08.09.2010 at http://yacy.net * +// $LastChangedDate $ +// $LastChangedRevision $ +// $LastChangedBy $ + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either @@ -64,7 +68,9 @@ public class sitemapParser extends AbstractParser implements Parser { //SUPPORTED_EXTENSIONS.add("xml"); } - public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { + public Document[] parse(final MultiProtocolURI url, final String mimeType, + final String charset, final InputStream source) + throws Failure, InterruptedException { SitemapReader sitemap; try { sitemap = new SitemapReader(source); @@ -72,10 +78,10 @@ public class sitemapParser extends AbstractParser implements Parser { throw new Parser.Failure("Load error:" + e.getMessage(), url); } - List docs = new ArrayList(); + final List docs = new ArrayList(); MultiProtocolURI uri; Document doc; - for (URLEntry item: sitemap) try { + for (final URLEntry item: sitemap) try { uri = new MultiProtocolURI(item.loc); doc = new Document( uri, @@ -134,7 +140,7 @@ public class sitemapParser extends AbstractParser implements Parser { } } - public static SitemapReader parse(InputStream stream) throws IOException { + public static SitemapReader parse(final InputStream stream) throws IOException { return new SitemapReader(stream); } @@ -145,46 +151,52 @@ public class sitemapParser extends AbstractParser implements Parser { */ public static class SitemapReader extends ArrayList { private static final long serialVersionUID = 1337L; - public SitemapReader(InputStream source) throws IOException { + public SitemapReader(final InputStream source) throws IOException { org.w3c.dom.Document doc; try { doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source); } catch (ParserConfigurationException e) { throw new IOException (e); } catch (SAXParseException e) { throw new IOException (e); } catch (SAXException e) { throw new IOException (e); } - NodeList SitemapNodes = doc.getElementsByTagName("sitemap"); - for (int i = 0; i < SitemapNodes.getLength(); i++) { - String url = new SitemapEntry((Element) SitemapNodes.item(i)).url(); + NodeList sitemapNodes = doc.getElementsByTagName("sitemap"); + for (int i = 0; i < sitemapNodes.getLength(); i++) { + String url = new SitemapEntry((Element) sitemapNodes.item(i)).url(); if (url != null && url.length() > 0) { try { - SitemapReader r = parse(new DigestURI(url)); - for (URLEntry ue: r) this.add(ue); + final SitemapReader r = parse(new DigestURI(url)); + for (final URLEntry ue: r) this.add(ue); } catch (IOException e) {} } } - NodeList urlEntryNodes = doc.getElementsByTagName("url"); + final NodeList urlEntryNodes = doc.getElementsByTagName("url"); for (int i = 0; i < urlEntryNodes.getLength(); i++) { this.add(new URLEntry((Element) urlEntryNodes.item(i))); } } + @Override public String toString() { - StringBuilder sb = new StringBuilder(); - for (URLEntry entry: this) sb.append(entry.toString()); + final StringBuilder sb = new StringBuilder(); + for (final URLEntry entry: this) { + sb.append(entry.toString()); + } return sb.toString(); } } public static class URLEntry { public String loc, lastmod, changefreq, priority; - public URLEntry(Element element) { + + public URLEntry(final Element element) { loc = val(element, "loc", ""); lastmod = val(element, "lastmod", ""); changefreq = val(element, "changefreq", ""); priority = val(element, "priority", ""); } + public String url() { return this.loc; } - public Date lastmod(Date dflt) { + + public Date lastmod(final Date dflt) { try { return DateFormatter.parseISO8601(lastmod); } catch (final ParseException e) { @@ -195,14 +207,17 @@ public class sitemapParser extends AbstractParser implements Parser { public static class SitemapEntry { public String loc, lastmod; - public SitemapEntry(Element element) { + + public SitemapEntry(final Element element) { loc = val(element, "loc", ""); lastmod = val(element, "lastmod", ""); } + public String url() { return this.loc; } - public Date lastmod(Date dflt) { + + public Date lastmod(final Date dflt) { try { return DateFormatter.parseISO8601(lastmod); } catch (final ParseException e) { @@ -211,10 +226,10 @@ public class sitemapParser extends AbstractParser implements Parser { } } - private static String val(Element parent, String label, String dflt) { - Element e = (Element) parent.getElementsByTagName(label).item(0); + private static String val(final Element parent, final String label, final String dflt) { + final Element e = (Element) parent.getElementsByTagName(label).item(0); if (e == null) return dflt; - Node child = e.getFirstChild(); + final Node child = e.getFirstChild(); return (child instanceof CharacterData) ? ((CharacterData) child).getData() : dflt; } } diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index 8963056c5..31f48734a 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -30,6 +30,7 @@ package net.yacy.document.parser; import java.io.IOException; import java.io.InputStream; import java.util.HashMap; +import java.util.Map; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; @@ -76,7 +77,7 @@ public class swfParser extends AbstractParser implements Parser { final String[] sections = null; final String abstrct = null; //TreeSet images = null; - final HashMap anchors = new HashMap(); + final Map anchors = new HashMap(); int urls = 0; int urlStart = -1; int urlEnd = 0; diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index ee20156b6..d17938539 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -3,6 +3,10 @@ * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany * First released 29.6.2010 at http://yacy.net * +// $LastChangedDate $ +// $LastChangedRevision $ +// $LastChangedBy $ + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either @@ -52,7 +56,7 @@ public class tarParser extends AbstractParser implements Parser { public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException { - List docacc = new ArrayList(); + final List docacc = new ArrayList(); Document[] subDocs = null; final String ext = url.getFileExtension().toLowerCase(); if (ext.equals("gz") || ext.equals("tgz")) { @@ -81,7 +85,7 @@ public class tarParser extends AbstractParser implements Parser { FileUtils.copy(tis, tmp, entry.getSize()); subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp); if (subDocs == null) continue; - for (Document d: subDocs) docacc.add(d); + for (final Document d: subDocs) docacc.add(d); } catch (final Parser.Failure e) { log.logWarning("tar parser entry " + name + ": " + e.getMessage()); } finally { diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index 9dcf6e3d8..46955a270 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -3,6 +3,10 @@ * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany * First released 03.01.2010 at http://yacy.net * +// $LastChangedDate $ +// $LastChangedRevision $ +// $LastChangedBy $ + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either @@ -50,38 +54,41 @@ public class torrentParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.add("application/x-bittorrent"); } - public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) + throws Parser.Failure, InterruptedException { byte[] b = null; try { b = FileUtils.read(source); } catch (IOException e1) { throw new Parser.Failure(e1.toString(), location); } - BDecoder bd = new BDecoder(b); - BObject bo = bd.parse(); + final BDecoder bd = new BDecoder(b); + final BObject bo = bd.parse(); if (bo == null) throw new Parser.Failure("BDecoder.parse returned null", location); if (bo.getType() != BType.dictionary) throw new Parser.Failure("BDecoder object is not a dictionary", location); - Map map = bo.getMap(); - BObject commento = map.get("comment"); - String comment = (commento == null) ? "" : new String(commento.getString()); + final Map map = bo.getMap(); + final BObject commento = map.get("comment"); + final String comment = (commento == null) ? "" : new String(commento.getString()); //Date creation = new Date(map.get("creation date").getInteger()); - BObject infoo = map.get("info"); - StringBuilder filenames = new StringBuilder(); + final BObject infoo = map.get("info"); + final StringBuilder filenames = new StringBuilder(); String title = ""; if (infoo != null) { - Map info = infoo.getMap(); - BObject fileso = info.get("files"); + final Map info = infoo.getMap(); + final BObject fileso = info.get("files"); if (fileso != null) { - List filelist = fileso.getList(); - for (BObject fo: filelist) { - BObject patho = fo.getMap().get("path"); + final List filelist = fileso.getList(); + for (final BObject fo: filelist) { + final BObject patho = fo.getMap().get("path"); if (patho != null) { - List l = patho.getList(); // one file may have several names - for (BObject fl: l) filenames.append(fl.toString()).append(" "); + final List l = patho.getList(); // one file may have several names + for (final BObject fl: l) { + filenames.append(fl.toString()).append(" "); + } } } } - BObject nameo = info.get("name"); + final BObject nameo = info.get("name"); if (nameo != null) title = new String(nameo.getString()); } if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName()); diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index 266dea507..13ce6e4a8 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -58,7 +58,8 @@ public class vcfParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.add("text/x-vcalendar"); } - public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, final InputStream source) + throws Parser.Failure, InterruptedException { try { final StringBuilder parsedTitle = new StringBuilder(); @@ -223,26 +224,26 @@ public class vcfParser extends AbstractParser implements Parser { } } - public static final String decodeQuotedPrintable(final String s) { - if (s == null) return null; - final byte[] b = s.getBytes(); - final StringBuilder sb = new StringBuilder(); - for (int i = 0; i < b.length; i++) { - final int c = b[i]; - if (c == '=') { - try { - final int u = Character.digit((char) b[++i], 16); - final int l = Character.digit((char) b[++i], 16); - if (u == -1 || l == -1) throw new RuntimeException("bad quoted-printable encoding"); - sb.append((char) ((u << 4) + l)); - } catch (final ArrayIndexOutOfBoundsException e) { - throw new RuntimeException("bad quoted-printable encoding"); - } - } else { - sb.append((char) c); - } - } - return sb.toString(); - } + private String decodeQuotedPrintable(final String s) { + if (s == null) return null; + final byte[] b = s.getBytes(); + final StringBuilder sb = new StringBuilder(); + for (int i = 0; i < b.length; i++) { + final int c = b[i]; + if (c == '=') { + try { + final int u = Character.digit((char) b[++i], 16); + final int l = Character.digit((char) b[++i], 16); + if (u == -1 || l == -1) throw new RuntimeException("bad quoted-printable encoding"); + sb.append((char) ((u << 4) + l)); + } catch (final ArrayIndexOutOfBoundsException e) { + throw new RuntimeException("bad quoted-printable encoding"); + } + } else { + sb.append((char) c); + } + } + return sb.toString(); + } } diff --git a/source/net/yacy/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java index ddf0d476f..141548ebc 100644 --- a/source/net/yacy/document/parser/vsdParser.java +++ b/source/net/yacy/document/parser/vsdParser.java @@ -62,7 +62,8 @@ public class vsdParser extends AbstractParser implements Parser { * parses the source documents and returns a plasmaParserDocument containing * all extracted information about the parsed document */ - public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) + throws Parser.Failure, InterruptedException { Document theDoc = null; @@ -70,7 +71,7 @@ public class vsdParser extends AbstractParser implements Parser { String contents = ""; SummaryInformation summary = null; try { - VisioTextExtractor extractor = new VisioTextExtractor(source); + final VisioTextExtractor extractor = new VisioTextExtractor(source); contents = extractor.getText(); summary = extractor.getSummaryInformation(); } catch (Exception e) { @@ -89,7 +90,7 @@ public class vsdParser extends AbstractParser implements Parser { } String abstrct = null; - abstrct = ((contents.length() > 80)? contents.substring(0, 80):contents.trim()). + abstrct = ((contents.length() > 80)? contents.substring(0, 80) : contents.trim()). replaceAll("\r\n"," "). replaceAll("\n"," "). replaceAll("\r"," "). @@ -124,12 +125,12 @@ public class vsdParser extends AbstractParser implements Parser { this.log.logSevere(errorMsg); throw new Parser.Failure(errorMsg, location); } finally { - if (theDoc == null) { + if (theDoc == null) { // if an unexpected error occures just log the error and raise a new Parser.Failure final String errorMsg = "Unable to parse the vsd document '" + location + "': possibly out of memory"; this.log.logSevere(errorMsg); throw new Parser.Failure(errorMsg, location); - } + } } } diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index e1fecc8c2..266f3fae4 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -3,6 +3,10 @@ * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany * First released 29.6.2010 at http://yacy.net * +// $LastChangedDate $ +// $LastChangedRevision $ +// $LastChangedBy $ + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either @@ -54,9 +58,11 @@ public class zipParser extends AbstractParser implements Parser { SUPPORTED_MIME_TYPES.add("application/vnd.android.package-archive"); } - public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { + public Document[] parse(final MultiProtocolURI url, final String mimeType, + final String charset, final InputStream source) + throws Parser.Failure, InterruptedException { Document[] docs = null; - List docacc = new ArrayList(); + final List docacc = new ArrayList(); ZipEntry entry; final ZipInputStream zis = new ZipInputStream(source); File tmp = null; @@ -76,7 +82,7 @@ public class zipParser extends AbstractParser implements Parser { FileUtils.copy(zis, tmp, entry.getSize()); docs = TextParser.parseSource(MultiProtocolURI.newURL(url, "#" + name), mime, null, tmp); if (docs == null) continue; - for (Document d: docs) docacc.add(d); + for (final Document d: docs) docacc.add(d); } catch (final Parser.Failure e) { log.logWarning("ZIP parser entry " + name + ": " + e.getMessage()); } finally { @@ -87,7 +93,7 @@ public class zipParser extends AbstractParser implements Parser { break; } } - if (docacc.size() == 0) return null; + if (docacc.isEmpty()) return null; return docacc.toArray(new Document[docacc.size()]); } }