*) minor changes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7394 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
low012 14 years ago
parent e38217fe88
commit c0274bd123

@ -41,7 +41,7 @@ import net.yacy.kelondro.util.FileUtils;
import org.apache.tools.bzip2.CBZip2InputStream;
public class bzipParser extends AbstractParser implements Parser {
public class bzipParser extends AbstractParser implements Parser {
public bzipParser() {
super("Bzip 2 UNIX Compressed File Parser");
@ -55,7 +55,9 @@ public class bzipParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.add("application/x-stuffit");
}
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
File tempFile = null;
Document[] docs;

@ -3,6 +3,10 @@
* Copyright 2009 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 02.10.2009 at http://yacy.net
*
// $LastChangedDate $
// $LastChangedRevision $
// $LastChangedBy $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
@ -49,10 +53,12 @@ public class csvParser extends AbstractParser implements Parser {
// construct a document using all cells of the document
// the first row is used as headline
// all lines are artificially terminated by a '.' to separate them as sentence for the condenser.
List<String[]> table = getTable(location, mimeType, charset, source);
final List<String[]> table = getTable(location, mimeType, charset, source);
if (table.isEmpty()) throw new Parser.Failure("document has no lines", location);
StringBuilder sb = new StringBuilder();
for (String[] row: table) sb.append(concatRow(row)).append(' ');
final StringBuilder sb = new StringBuilder();
for (final String[] row: table) {
sb.append(concatRow(row)).append(' ');
}
try {
return new Document[]{new Document(
location,
@ -75,18 +81,18 @@ public class csvParser extends AbstractParser implements Parser {
}
}
public String concatRow(String[] column) {
StringBuilder sb = new StringBuilder(80);
for (int i = 0; i < column.length; i++) {
if (i != 0) sb.append(' ');
sb.append(column[i]);
private String concatRow(String[] columns) {
final StringBuilder sb = new StringBuilder(80);
for (final String column : columns) {
if (sb.length() > 0) sb.append(' ');
sb.append(column);
}
sb.append('.');
return sb.toString();
}
public List<String[]> getTable(MultiProtocolURI location, String mimeType, String charset, InputStream source) {
ArrayList<String[]> rows = new ArrayList<String[]>();
private List<String[]> getTable(MultiProtocolURI location, String mimeType, String charset, InputStream source) {
final List<String[]> rows = new ArrayList<String[]>();
BufferedReader reader;
try {
reader = new BufferedReader(new InputStreamReader(source, charset));
@ -102,16 +108,16 @@ public class csvParser extends AbstractParser implements Parser {
if (row.length() == 0) continue;
if (separator == null) {
// try comma, semicolon and tab; take that one that results with more columns
String[] colc = row.split(",");
String[] cols = row.split(";");
String[] colt = row.split("\t");
final String[] colc = row.split(",");
final String[] cols = row.split(";");
final String[] colt = row.split("\t");
if (colc.length >= cols.length && colc.length >= colt.length) separator = ",";
if (cols.length >= colc.length && cols.length >= colt.length) separator = ";";
if (colt.length >= cols.length && colt.length >= colc.length) separator = "\t";
}
row = stripQuotes(row, '\"', separator.charAt(0), ' ');
row = stripQuotes(row, '\'', separator.charAt(0), ' ');
String[] cols = row.split(separator);
final String[] cols = row.split(separator);
if (columns == -1) columns = cols.length;
//if (cols.length != columns) continue; // skip lines that have the wrong number of columns
rows.add(cols);
@ -130,19 +136,22 @@ public class csvParser extends AbstractParser implements Parser {
* @param replacement
* @return the line without the quotes
*/
public static String stripQuotes(String line, char quote, char separator, char replacement) {
private static String stripQuotes(final String line, final char quote,
final char separator, final char replacement) {
String ret = line;
int p, q;
// find left quote
while ((p = line.indexOf(quote)) >= 0) {
q = line.indexOf(quote, p + 1);
while ((p = ret.indexOf(quote)) >= 0) {
q = ret.indexOf(quote, p + 1);
if (q < 0) {
// there is only a single quote but no 'right' quote.
// This data is not well-formed. Just remove the quote and give up.
return line.substring(0, p) + line.substring(p + 1);
return ret.substring(0, p) + ret.substring(p + 1);
}
line = line.substring(0, p) + line.substring(p + 1, q).replace(separator, replacement) + line.substring(q + 1);
ret = ret.substring(0, p) + ret.substring(p + 1, q).replace(separator, replacement) + ret.substring(q + 1);
}
return line;
return ret;
}
}

@ -38,8 +38,8 @@ import org.apache.poi.hwpf.extractor.WordExtractor;
public class docParser extends AbstractParser implements Parser {
public docParser() {
super("Word Document Parser");
public docParser() {
super("Word Document Parser");
SUPPORTED_EXTENSIONS.add("doc");
SUPPORTED_MIME_TYPES.add("application/msword");
SUPPORTED_MIME_TYPES.add("application/doc");
@ -50,9 +50,11 @@ public class docParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.add("application/word");
SUPPORTED_MIME_TYPES.add("application/x-msw6");
SUPPORTED_MIME_TYPES.add("application/x-msword");
}
}
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
final WordExtractor extractor;
@ -62,7 +64,7 @@ public class docParser extends AbstractParser implements Parser {
throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
}
StringBuilder contents = new StringBuilder();
final StringBuilder contents = new StringBuilder();
try {
contents.append(extractor.getText().trim());
contents.append(" ");
@ -72,15 +74,15 @@ public class docParser extends AbstractParser implements Parser {
} catch (Exception e) {
throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location);
}
String title = (contents.length() > 240) ? contents.substring(0,240) : contents.toString().trim();
String title = (contents.length() > 240) ? contents.substring(0,240) : contents.toString().trim();
title.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
if (title.length() > 80) title = title.substring(0, 80);
int l = title.length();
while (true) {
title = title.replaceAll(" ", " ");
if (title.length() == l) break;
l = title.length();
}
if (title.length() > 80) title = title.substring(0, 80);
int l = title.length();
while (true) {
title = title.replaceAll(" ", " ");
if (title.length() == l) break;
l = title.length();
}
Document[] docs;
try {
@ -105,6 +107,6 @@ public class docParser extends AbstractParser implements Parser {
}
return docs;
}
}
}

@ -3,6 +3,10 @@
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
* First released 30.11.2010 at http://yacy.net
*
// $LastChangedDate $
// $LastChangedRevision $
// $LastChangedBy $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
@ -38,9 +42,11 @@ public class genericParser extends AbstractParser implements Parser {
// this parser is used if no other fits. This parser fits all
}
public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source1) throws Parser.Failure, InterruptedException {
public Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source1)
throws Parser.Failure, InterruptedException {
Document[] docs = new Document[]{new Document(
final Document[] docs = new Document[]{new Document(
location,
mimeType,
charset,
@ -56,7 +62,9 @@ public class genericParser extends AbstractParser implements Parser {
null,
null,
false)};
for (Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
for (final Document d: docs) {
assert d.getText() != null : "mimeType = " + mimeType;
} // verify docs
return docs;
}
}

@ -3,6 +3,10 @@
* Copyright 2009 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 09.07.2009 at http://yacy.net
*
// $LastChangedDate $
// $LastChangedRevision $
// $LastChangedBy $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
@ -159,7 +163,11 @@ public class htmlParser extends AbstractParser implements Parser {
private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
final String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
for (int i = 1; i <= 4; i++) {
for (final String headline : scraper.getHeadlines(i)) {
sections[p++] = headline;
}
}
final Document[] ppds = new Document[]{new Document(
location,
mimeType,
@ -177,7 +185,9 @@ public class htmlParser extends AbstractParser implements Parser {
scraper.getImages(),
scraper.indexingDenied())};
//scraper.close();
for (Document ppd: ppds) ppd.setFavicon(scraper.getFavicon());
for (final Document ppd: ppds) {
ppd.setFavicon(scraper.getFavicon());
}
return ppds;
}
@ -256,11 +266,7 @@ public class htmlParser extends AbstractParser implements Parser {
return encoding;
}
public boolean indexingDenied() {
return false;
}
public static void main(String[] args) {
// test parsing of a url
MultiProtocolURI url;

@ -84,7 +84,9 @@ public class odtParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.add("application/OOo-writer");
}
private Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws Parser.Failure, InterruptedException {
private Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final File dest)
throws Parser.Failure, InterruptedException {
CharBuffer writer = null;
try {
@ -138,7 +140,7 @@ public class odtParser extends AbstractParser implements Parser {
}
// make the languages set
Set<String> languages = new HashSet<String>(1);
final Set<String> languages = new HashSet<String>(1);
if (docLanguage != null) languages.add(docLanguage);
// if there is no title available we generate one

@ -126,15 +126,13 @@ public class ooxmlParser extends AbstractParser implements Parser {
}
// make the languages set
Set<String> languages = new HashSet<String>(1);
final Set<String> languages = new HashSet<String>(1);
if (docLanguage != null && docLanguage.length() == 0)
languages.add(docLanguage);
// if there is no title available we generate one
if (docLongTitle == null || docLongTitle.length() == 0) {
if (docShortTitle != null) {
if ((docLongTitle == null || docLongTitle.length() == 0) && (docShortTitle != null)) {
docLongTitle = docShortTitle;
}
}
// split the keywords
@ -166,7 +164,9 @@ public class ooxmlParser extends AbstractParser implements Parser {
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {/* ignore this */}
if (writer != null) try {
writer.close();
} catch (final Exception ex) {/* ignore this */}
Log.logException(e);
throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(),location);

@ -53,7 +53,7 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
public class pdfParser extends AbstractParser implements Parser {
public class pdfParser extends AbstractParser implements Parser {
public pdfParser() {
super("Acrobat Portable Document Parser");
@ -118,7 +118,9 @@ public class pdfParser extends AbstractParser implements Parser {
// info.getModificationDate();
}
if (docTitle == null || docTitle.length() == 0) docTitle = MultiProtocolURI.unescape(location.getFileName());
if (docTitle == null || docTitle.length() == 0) {
docTitle = MultiProtocolURI.unescape(location.getFileName());
}
CharBuffer writer = null;
try {
// create a writer for output
@ -139,8 +141,12 @@ public class pdfParser extends AbstractParser implements Parser {
pdfDoc = null;
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
if (docTitle == null) docTitle = docSubject;
if (docKeywordStr != null) {
docKeywords = docKeywordStr.split(" |,");
}
if (docTitle == null) {
docTitle = docSubject;
}
byte[] contentBytes;
try {

@ -63,7 +63,7 @@ public class psParser extends AbstractParser implements Parser {
}
}
public boolean testForPs2Ascii() {
private boolean testForPs2Ascii() {
try {
String procOutputLine = null;
final StringBuilder procOutput = new StringBuilder();
@ -83,7 +83,7 @@ public class psParser extends AbstractParser implements Parser {
}
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final File sourceFile) throws Parser.Failure, InterruptedException {
private Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final File sourceFile) throws Parser.Failure, InterruptedException {
File outputFile = null;
try {
@ -128,7 +128,7 @@ public class psParser extends AbstractParser implements Parser {
}
}
public void parseUsingJava(final File inputFile, final File outputFile) throws Exception {
private void parseUsingJava(final File inputFile, final File outputFile) throws Exception {
BufferedReader reader = null;
BufferedWriter writer = null;
@ -166,7 +166,7 @@ public class psParser extends AbstractParser implements Parser {
}
}
} else if (version.length() > 0 && version.charAt(0) == '3') {
} else if (version.length() > 0 && version.charAt(0) == '3') {
final StringBuilder stmt = new StringBuilder();
boolean isBMP = false;
boolean isStore = false;
@ -226,32 +226,34 @@ public class psParser extends AbstractParser implements Parser {
int execCode = 0;
StringBuilder procErr = null;
try {
String procOutputLine = null;
final StringBuilder procOut = new StringBuilder();
procErr = new StringBuilder();
final Process ps2asciiProc = Runtime.getRuntime().exec(new String[]{"ps2ascii", inputFile.getAbsolutePath(),outputFile.getAbsolutePath()});
final BufferedReader stdOut = new BufferedReader(new InputStreamReader(ps2asciiProc.getInputStream()));
final BufferedReader stdErr = new BufferedReader(new InputStreamReader(ps2asciiProc.getErrorStream()));
while ((procOutputLine = stdOut.readLine()) != null) {
procOut.append(procOutputLine);
}
stdOut.close();
while ((procOutputLine = stdErr.readLine()) != null) {
procErr.append(procOutputLine);
}
stdErr.close();
execCode = ps2asciiProc.waitFor();
String procOutputLine;
final StringBuilder procOut = new StringBuilder();
procErr = new StringBuilder();
final Process ps2asciiProc = Runtime.getRuntime().exec(new String[]{"ps2ascii", inputFile.getAbsolutePath(),outputFile.getAbsolutePath()});
final BufferedReader stdOut = new BufferedReader(new InputStreamReader(ps2asciiProc.getInputStream()));
final BufferedReader stdErr = new BufferedReader(new InputStreamReader(ps2asciiProc.getErrorStream()));
while ((procOutputLine = stdOut.readLine()) != null) {
procOut.append(procOutputLine);
}
stdOut.close();
while ((procOutputLine = stdErr.readLine()) != null) {
procErr.append(procOutputLine);
}
stdErr.close();
execCode = ps2asciiProc.waitFor();
} catch (final Exception e) {
final String errorMsg = "Unable to convert ps to ascii. " + e.getMessage();
this.log.logSevere(errorMsg);
throw new Exception(errorMsg);
final String errorMsg = "Unable to convert ps to ascii. " + e.getMessage();
this.log.logSevere(errorMsg);
throw new Exception(errorMsg);
}
if (execCode != 0) throw new Exception("Unable to convert ps to ascii. ps2ascii returned statuscode " + execCode + "\n" + procErr.toString());
}
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
File tempFile = null;
try {

@ -3,6 +3,10 @@
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 20.08.2010 at http://yacy.net
*
// $LastChangedDate $
// $LastChangedRevision $
// $LastChangedBy $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
@ -53,7 +57,9 @@ public class rssParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.add("application/atom+xml");
}
public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
public Document[] parse(final MultiProtocolURI url, final String mimeType,
final String charset, final InputStream source)
throws Failure, InterruptedException {
RSSReader rssReader;
try {
rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source, RSSReader.Type.none);
@ -61,14 +67,14 @@ public class rssParser extends AbstractParser implements Parser {
throw new Parser.Failure("Load error:" + e.getMessage(), url, e);
}
RSSFeed feed = rssReader.getFeed();
final RSSFeed feed = rssReader.getFeed();
//RSSMessage channel = feed.getChannel();
List<Document> docs = new ArrayList<Document>();
final List<Document> docs = new ArrayList<Document>();
MultiProtocolURI uri;
Set<String> languages;
Map<MultiProtocolURI, String> anchors;
Document doc;
for (Hit item: feed) try {
for (final Hit item: feed) try {
uri = new MultiProtocolURI(item.getLink());
languages = new HashSet<String>();
languages.add(item.getLanguage());
@ -95,7 +101,7 @@ public class rssParser extends AbstractParser implements Parser {
continue;
}
Document[] da = new Document[docs.size()];
final Document[] da = new Document[docs.size()];
docs.toArray(da);
return da;
}

@ -40,20 +40,21 @@ import net.yacy.document.Parser;
public class rtfParser extends AbstractParser implements Parser {
public rtfParser() {
super("Rich Text Format Parser");
public rtfParser() {
super("Rich Text Format Parser");
SUPPORTED_EXTENSIONS.add("rtf");
SUPPORTED_MIME_TYPES.add("text/rtf");
SUPPORTED_MIME_TYPES.add("text/richtext");
SUPPORTED_MIME_TYPES.add("application/rtf");
SUPPORTED_MIME_TYPES.add("application/x-rtf");
SUPPORTED_MIME_TYPES.add("application/x-soffice");
}
}
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
try {
try {
final DefaultStyledDocument doc = new DefaultStyledDocument();
final RTFEditorKit theRtfEditorKit = new RTFEditorKit();
@ -81,13 +82,12 @@ public class rtfParser extends AbstractParser implements Parser {
null,
null,
false)};
}
catch (final Exception e) {
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
throw new Parser.Failure("Unexpected error while parsing rtf resource." + e.getMessage(),location);
}
}
}
}
}

@ -96,7 +96,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
// and parse the extracted content
public static class SZParserExtractCallback extends ArchiveExtractCallback {
private static class SZParserExtractCallback extends ArchiveExtractCallback {
private final Log log;
private ByteArrayOutputStream cfos = null;

@ -3,6 +3,10 @@
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 08.09.2010 at http://yacy.net
*
// $LastChangedDate $
// $LastChangedRevision $
// $LastChangedBy $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
@ -64,7 +68,9 @@ public class sitemapParser extends AbstractParser implements Parser {
//SUPPORTED_EXTENSIONS.add("xml");
}
public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
public Document[] parse(final MultiProtocolURI url, final String mimeType,
final String charset, final InputStream source)
throws Failure, InterruptedException {
SitemapReader sitemap;
try {
sitemap = new SitemapReader(source);
@ -72,10 +78,10 @@ public class sitemapParser extends AbstractParser implements Parser {
throw new Parser.Failure("Load error:" + e.getMessage(), url);
}
List<Document> docs = new ArrayList<Document>();
final List<Document> docs = new ArrayList<Document>();
MultiProtocolURI uri;
Document doc;
for (URLEntry item: sitemap) try {
for (final URLEntry item: sitemap) try {
uri = new MultiProtocolURI(item.loc);
doc = new Document(
uri,
@ -134,7 +140,7 @@ public class sitemapParser extends AbstractParser implements Parser {
}
}
public static SitemapReader parse(InputStream stream) throws IOException {
public static SitemapReader parse(final InputStream stream) throws IOException {
return new SitemapReader(stream);
}
@ -145,46 +151,52 @@ public class sitemapParser extends AbstractParser implements Parser {
*/
public static class SitemapReader extends ArrayList<URLEntry> {
private static final long serialVersionUID = 1337L;
public SitemapReader(InputStream source) throws IOException {
public SitemapReader(final InputStream source) throws IOException {
org.w3c.dom.Document doc;
try { doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(source); }
catch (ParserConfigurationException e) { throw new IOException (e); }
catch (SAXParseException e) { throw new IOException (e); }
catch (SAXException e) { throw new IOException (e); }
NodeList SitemapNodes = doc.getElementsByTagName("sitemap");
for (int i = 0; i < SitemapNodes.getLength(); i++) {
String url = new SitemapEntry((Element) SitemapNodes.item(i)).url();
NodeList sitemapNodes = doc.getElementsByTagName("sitemap");
for (int i = 0; i < sitemapNodes.getLength(); i++) {
String url = new SitemapEntry((Element) sitemapNodes.item(i)).url();
if (url != null && url.length() > 0) {
try {
SitemapReader r = parse(new DigestURI(url));
for (URLEntry ue: r) this.add(ue);
final SitemapReader r = parse(new DigestURI(url));
for (final URLEntry ue: r) this.add(ue);
} catch (IOException e) {}
}
}
NodeList urlEntryNodes = doc.getElementsByTagName("url");
final NodeList urlEntryNodes = doc.getElementsByTagName("url");
for (int i = 0; i < urlEntryNodes.getLength(); i++) {
this.add(new URLEntry((Element) urlEntryNodes.item(i)));
}
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (URLEntry entry: this) sb.append(entry.toString());
final StringBuilder sb = new StringBuilder();
for (final URLEntry entry: this) {
sb.append(entry.toString());
}
return sb.toString();
}
}
public static class URLEntry {
public String loc, lastmod, changefreq, priority;
public URLEntry(Element element) {
public URLEntry(final Element element) {
loc = val(element, "loc", "");
lastmod = val(element, "lastmod", "");
changefreq = val(element, "changefreq", "");
priority = val(element, "priority", "");
}
/**
 * Returns the value held in the {@code loc} field of this entry.
 *
 * @return the current content of {@code loc}
 */
public String url() {
    return loc;
}
public Date lastmod(Date dflt) {
public Date lastmod(final Date dflt) {
try {
return DateFormatter.parseISO8601(lastmod);
} catch (final ParseException e) {
@ -195,14 +207,17 @@ public class sitemapParser extends AbstractParser implements Parser {
public static class SitemapEntry {
public String loc, lastmod;
public SitemapEntry(Element element) {
public SitemapEntry(final Element element) {
loc = val(element, "loc", "");
lastmod = val(element, "lastmod", "");
}
/**
 * Returns the value held in the {@code loc} field of this entry.
 *
 * @return the current content of {@code loc}
 */
public String url() {
    return loc;
}
public Date lastmod(Date dflt) {
public Date lastmod(final Date dflt) {
try {
return DateFormatter.parseISO8601(lastmod);
} catch (final ParseException e) {
@ -211,10 +226,10 @@ public class sitemapParser extends AbstractParser implements Parser {
}
}
private static String val(Element parent, String label, String dflt) {
Element e = (Element) parent.getElementsByTagName(label).item(0);
private static String val(final Element parent, final String label, final String dflt) {
final Element e = (Element) parent.getElementsByTagName(label).item(0);
if (e == null) return dflt;
Node child = e.getFirstChild();
final Node child = e.getFirstChild();
return (child instanceof CharacterData) ? ((CharacterData) child).getData() : dflt;
}
}

@ -30,6 +30,7 @@ package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
@ -76,7 +77,7 @@ public class swfParser extends AbstractParser implements Parser {
final String[] sections = null;
final String abstrct = null;
//TreeSet images = null;
final HashMap<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
int urls = 0;
int urlStart = -1;
int urlEnd = 0;

@ -3,6 +3,10 @@
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 29.6.2010 at http://yacy.net
*
// $LastChangedDate $
// $LastChangedRevision $
// $LastChangedBy $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
@ -52,7 +56,7 @@ public class tarParser extends AbstractParser implements Parser {
public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
List<Document> docacc = new ArrayList<Document>();
final List<Document> docacc = new ArrayList<Document>();
Document[] subDocs = null;
final String ext = url.getFileExtension().toLowerCase();
if (ext.equals("gz") || ext.equals("tgz")) {
@ -81,7 +85,7 @@ public class tarParser extends AbstractParser implements Parser {
FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
if (subDocs == null) continue;
for (Document d: subDocs) docacc.add(d);
for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) {
log.logWarning("tar parser entry " + name + ": " + e.getMessage());
} finally {

@ -3,6 +3,10 @@
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 03.01.2010 at http://yacy.net
*
// $LastChangedDate $
// $LastChangedRevision $
// $LastChangedBy $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
@ -50,38 +54,41 @@ public class torrentParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.add("application/x-bittorrent");
}
public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source)
throws Parser.Failure, InterruptedException {
byte[] b = null;
try {
b = FileUtils.read(source);
} catch (IOException e1) {
throw new Parser.Failure(e1.toString(), location);
}
BDecoder bd = new BDecoder(b);
BObject bo = bd.parse();
final BDecoder bd = new BDecoder(b);
final BObject bo = bd.parse();
if (bo == null) throw new Parser.Failure("BDecoder.parse returned null", location);
if (bo.getType() != BType.dictionary) throw new Parser.Failure("BDecoder object is not a dictionary", location);
Map<String, BObject> map = bo.getMap();
BObject commento = map.get("comment");
String comment = (commento == null) ? "" : new String(commento.getString());
final Map<String, BObject> map = bo.getMap();
final BObject commento = map.get("comment");
final String comment = (commento == null) ? "" : new String(commento.getString());
//Date creation = new Date(map.get("creation date").getInteger());
BObject infoo = map.get("info");
StringBuilder filenames = new StringBuilder();
final BObject infoo = map.get("info");
final StringBuilder filenames = new StringBuilder();
String title = "";
if (infoo != null) {
Map<String, BObject> info = infoo.getMap();
BObject fileso = info.get("files");
final Map<String, BObject> info = infoo.getMap();
final BObject fileso = info.get("files");
if (fileso != null) {
List<BObject> filelist = fileso.getList();
for (BObject fo: filelist) {
BObject patho = fo.getMap().get("path");
final List<BObject> filelist = fileso.getList();
for (final BObject fo: filelist) {
final BObject patho = fo.getMap().get("path");
if (patho != null) {
List<BObject> l = patho.getList(); // one file may have several names
for (BObject fl: l) filenames.append(fl.toString()).append(" ");
final List<BObject> l = patho.getList(); // one file may have several names
for (final BObject fl: l) {
filenames.append(fl.toString()).append(" ");
}
}
}
}
BObject nameo = info.get("name");
final BObject nameo = info.get("name");
if (nameo != null) title = new String(nameo.getString());
}
if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName());

@ -58,7 +58,8 @@ public class vcfParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.add("text/x-vcalendar");
}
public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
try {
final StringBuilder parsedTitle = new StringBuilder();
@ -223,26 +224,26 @@ public class vcfParser extends AbstractParser implements Parser {
}
}
/**
 * Decodes the quoted-printable escapes contained in a string.
 * <p>
 * Every {@code =XY} sequence, where X and Y are ASCII hexadecimal digits, is
 * replaced by the single character with code {@code 0xXY}. All other
 * characters are copied to the result unchanged. Soft line breaks (an
 * {@code =} directly followed by a line terminator) are not supported and are
 * rejected like any other malformed escape.
 *
 * @param s the encoded text, may be {@code null}
 * @return the decoded text, or {@code null} if {@code s} is {@code null}
 * @throws RuntimeException if an {@code =} is not followed by two hexadecimal digits
 */
public static final String decodeQuotedPrintable(final String s) {
    if (s == null) return null;
    final int length = s.length();
    final StringBuilder sb = new StringBuilder(length);
    // Work on the characters directly. Going through s.getBytes() made the
    // result depend on the platform charset and sign-extended every byte
    // >= 0x80, which turned non-ASCII input into garbage characters.
    for (int i = 0; i < length; i++) {
        final char c = s.charAt(i);
        if (c != '=') {
            sb.append(c);
            continue;
        }
        // An escape needs two more characters; check the bounds explicitly
        // instead of waiting for an index exception.
        if (i + 2 >= length) throw new RuntimeException("bad quoted-printable encoding");
        final char hi = s.charAt(i + 1);
        final char lo = s.charAt(i + 2);
        // Only ASCII hex digits are valid; Character.digit alone would also
        // accept non-ASCII digit characters.
        final int u = (hi < 0x80) ? Character.digit(hi, 16) : -1;
        final int l = (lo < 0x80) ? Character.digit(lo, 16) : -1;
        if (u == -1 || l == -1) throw new RuntimeException("bad quoted-printable encoding");
        sb.append((char) ((u << 4) + l));
        i += 2; // skip the two hex digits that were just consumed
    }
    return sb.toString();
}
/**
 * Decodes the quoted-printable escapes contained in a string.
 * <p>
 * Literal characters are passed through unchanged; each <code>=XX</code>
 * escape (two ASCII hexadecimal digits) becomes the one character with the
 * escaped code (0-255). Multi-byte sequences are not charset-decoded and
 * soft line breaks (<code>=</code> followed by a line end) are treated as
 * malformed input.
 *
 * @param s the quoted-printable encoded text, may be <code>null</code>
 * @return the decoded text, or <code>null</code> if <code>s</code> is <code>null</code>
 * @throws IllegalArgumentException (a RuntimeException) if an <code>=</code>
 *         is not followed by two hexadecimal digits
 */
private String decodeQuotedPrintable(final String s) {
    if (s == null) return null;
    final int length = s.length();
    final StringBuilder sb = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
        final char c = s.charAt(i);
        if (c == '=') {
            // explicit bounds check instead of catching ArrayIndexOutOfBoundsException
            if (i + 2 >= length) {
                throw new IllegalArgumentException("bad quoted-printable encoding");
            }
            final char hi = s.charAt(++i);
            final char lo = s.charAt(++i);
            // restrict to ASCII hex digits, Character.digit alone would also
            // accept other Unicode digits
            final int u = (hi < 128) ? Character.digit(hi, 16) : -1;
            final int l = (lo < 128) ? Character.digit(lo, 16) : -1;
            if (u == -1 || l == -1) {
                throw new IllegalArgumentException("bad quoted-printable encoding");
            }
            sb.append((char) ((u << 4) + l));
        } else {
            // copy the character itself: the former byte-based loop depended on the
            // platform charset and sign-extended non-ASCII bytes into U+FFxx characters
            sb.append(c);
        }
    }
    return sb.toString();
}
}

@ -62,7 +62,8 @@ public class vsdParser extends AbstractParser implements Parser {
* parses the source documents and returns a plasmaParserDocument containing
* all extracted information about the parsed document
*/
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
Document theDoc = null;
@ -70,7 +71,7 @@ public class vsdParser extends AbstractParser implements Parser {
String contents = "";
SummaryInformation summary = null;
try {
VisioTextExtractor extractor = new VisioTextExtractor(source);
final VisioTextExtractor extractor = new VisioTextExtractor(source);
contents = extractor.getText();
summary = extractor.getSummaryInformation();
} catch (Exception e) {
@ -89,7 +90,7 @@ public class vsdParser extends AbstractParser implements Parser {
}
String abstrct = null;
abstrct = ((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
abstrct = ((contents.length() > 80)? contents.substring(0, 80) : contents.trim()).
replaceAll("\r\n"," ").
replaceAll("\n"," ").
replaceAll("\r"," ").
@ -124,12 +125,12 @@ public class vsdParser extends AbstractParser implements Parser {
this.log.logSevere(errorMsg);
throw new Parser.Failure(errorMsg, location);
} finally {
if (theDoc == null) {
if (theDoc == null) {
// if an unexpected error occurs, just log the error and raise a new Parser.Failure
final String errorMsg = "Unable to parse the vsd document '" + location + "': possibly out of memory";
this.log.logSevere(errorMsg);
throw new Parser.Failure(errorMsg, location);
}
}
}
}

@ -3,6 +3,10 @@
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 29.6.2010 at http://yacy.net
*
// $LastChangedDate $
// $LastChangedRevision $
// $LastChangedBy $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
@ -54,9 +58,11 @@ public class zipParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.add("application/vnd.android.package-archive");
}
public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
public Document[] parse(final MultiProtocolURI url, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
Document[] docs = null;
List<Document> docacc = new ArrayList<Document>();
final List<Document> docacc = new ArrayList<Document>();
ZipEntry entry;
final ZipInputStream zis = new ZipInputStream(source);
File tmp = null;
@ -76,7 +82,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
docs = TextParser.parseSource(MultiProtocolURI.newURL(url, "#" + name), mime, null, tmp);
if (docs == null) continue;
for (Document d: docs) docacc.add(d);
for (final Document d: docs) docacc.add(d);
} catch (final Parser.Failure e) {
log.logWarning("ZIP parser entry " + name + ": " + e.getMessage());
} finally {
@ -87,7 +93,7 @@ public class zipParser extends AbstractParser implements Parser {
break;
}
}
if (docacc.size() == 0) return null;
if (docacc.isEmpty()) return null;
return docacc.toArray(new Document[docacc.size()]);
}
}

Loading…
Cancel
Save