/**
 *  TextParser.java
 *  Copyright 2009 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
 *  First released 09.07.2009 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.document;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.bzipParser;
import net.yacy.document.parser.csvParser;
import net.yacy.document.parser.docParser;
import net.yacy.document.parser.genericParser;
import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.mmParser;
import net.yacy.document.parser.odtParser;
import net.yacy.document.parser.ooxmlParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.pptParser;
import net.yacy.document.parser.psParser;
import net.yacy.document.parser.rssParser;
import net.yacy.document.parser.rtfParser;
import net.yacy.document.parser.sevenzipParser;
import net.yacy.document.parser.sidAudioParser;
import net.yacy.document.parser.swfParser;
import net.yacy.document.parser.tarParser;
import net.yacy.document.parser.torrentParser;
import net.yacy.document.parser.vcfParser;
import net.yacy.document.parser.vsdParser;
import net.yacy.document.parser.xlsParser;
import net.yacy.document.parser.zipParser;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;

/**
 * Static registry and dispatcher for document parsers.
 * <p>
 * Parsers are registered once in the static initializer, indexed by the mime types and
 * file extensions they declare. The {@code parseSource} methods select candidate parsers
 * for a url / mime type pair and run them on a file, a stream or a byte array.
 * Mime types and file extensions can additionally be put on deny lists.
 */
public final class TextParser {

    private static final Log log = new Log("PARSER");

    /** dummy value for the deny maps, which are used as concurrent sets */
    private static final Object v = new Object();

    /** fallback parser which is always appended to the list of candidate parsers */
    private static final Parser genericIdiom = new genericParser();

    /** normalized mime type -> parser */
    private static final Map<String, Parser> mime2parser = new ConcurrentHashMap<String, Parser>();
    /** lower-case file extension -> parser */
    private static final Map<String, Parser> ext2parser = new ConcurrentHashMap<String, Parser>();
    /** lower-case file extension -> first mime type declared by the parser of that extension */
    private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
    /** denied normalized mime types (keys only) */
    private static final Map<String, Object> denyMime = new ConcurrentHashMap<String, Object>();
    /** denied lower-case file extensions (keys only) */
    private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<String, Object>();

    static {
        initParser(new bzipParser());
        initParser(new csvParser());
        initParser(new docParser());
        initParser(new gzipParser());
        initParser(new htmlParser());
        initParser(new genericImageParser());
        initParser(new mmParser());
        initParser(new odtParser());
        initParser(new ooxmlParser());
        initParser(new pdfParser());
        initParser(new pptParser());
        initParser(new psParser());
        initParser(new rssParser());
        initParser(new rtfParser());
        initParser(new sevenzipParser());
        initParser(new sidAudioParser());
        initParser(new swfParser());
        initParser(new tarParser());
        initParser(new torrentParser());
        initParser(new vcfParser());
        initParser(new vsdParser());
        initParser(new xlsParser());
        initParser(new zipParser());
    }

    /**
     * @return a new set with all parsers that are registered for at least one extension or mime type
     */
    public static Set<Parser> parsers() {
        final Set<Parser> c = new HashSet<Parser>();
        c.addAll(ext2parser.values());
        c.addAll(mime2parser.values());
        return c;
    }

    /**
     * Register a parser for all mime types and file extensions it declares.
     * An existing registration for the same key is overwritten and reported as severe log entry.
     * @param parser the parser to register
     */
    private static void initParser(final Parser parser) {
        // process the mime types; the first one becomes the mime type of all extensions of this parser
        String prototypeMime = null;
        for (final String mime: parser.supportedMimeTypes()) {
            final String mimeType = normalizeMimeType(mime);
            if (prototypeMime == null) {
                prototypeMime = mimeType;
            }
            final Parser p0 = mime2parser.get(mimeType);
            if (p0 != null) {
                log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
            }
            mime2parser.put(mimeType, parser);
            Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());
        }

        // map the extensions to the prototype mime type
        if (prototypeMime != null) {
            for (final String rawExt: parser.supportedExtensions()) {
                final String ext = rawExt.toLowerCase(Locale.ENGLISH);
                final String s = ext2mime.get(ext);
                if (s != null) {
                    log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
                }
                ext2mime.put(ext, prototypeMime);
            }
        }

        // process the extensions
        for (final String rawExt: parser.supportedExtensions()) {
            final String ext = rawExt.toLowerCase(Locale.ENGLISH);
            final Parser p0 = ext2parser.get(ext);
            if (p0 != null) {
                log.logSevere("parser for extension '" + ext + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
            }
            ext2parser.put(ext, parser);
            Log.logInfo("PARSER", "Parser for extension '" + ext + "': " + parser.getName());
        }
    }

    /**
     * Parse the content of a file.
     * @param location the url of the resource
     * @param mimeType the mime type as given by the source, may be null
     * @param charset the charset as given by the source
     * @param sourceFile the file that holds the content
     * @param multipleVirtualDocs passed on to the stream-based parseSource method
     * @return the parsed documents
     * @throws InterruptedException if such an exception is raised while parsing
     * @throws Parser.Failure if the file does not exist, is not readable, is empty, or if parsing failed
     */
    public static Document[] parseSource(
            final MultiProtocolURI location,
            final String mimeType,
            final String charset,
            final File sourceFile,
            final boolean multipleVirtualDocs
        ) throws InterruptedException, Parser.Failure {

        BufferedInputStream sourceStream = null;
        Document[] docs = null;
        try {
            if (log.isFine()) log.logFine("Parsing '" + location + "' from file");
            if (!sourceFile.exists() || !sourceFile.canRead() || sourceFile.length() == 0) {
                final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2).";
                log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
                throw new Parser.Failure(errorMsg, location);
            }
            sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
            docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream, multipleVirtualDocs);
        } catch (final Exception e) {
            // declared exceptions are passed through unchanged, everything else is wrapped
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof Parser.Failure) throw (Parser.Failure) e;
            log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
            throw new Parser.Failure("Unexpected exception: " + e.getMessage(), location);
        } finally {
            if (sourceStream != null) {
                try {
                    sourceStream.close();
                } catch (final Exception ignored) {
                    // best effort: a failing close of a read-only stream must not mask the parsing result
                }
            }
        }
        for (final Document d: docs) { assert d.getText() != null; } // verify docs
        return docs;
    }

    /**
     * Parse content that is given as byte array.
     * @param location the url of the resource
     * @param mimeType the mime type as given by the source, may be null
     * @param charset the charset as given by the source
     * @param content the content to parse
     * @param multipleVirtualDocs if true and exactly one document was parsed, the result is
     *        replaced by the result of {@link #virtualDocs(Document)} for that document
     * @return the parsed documents
     * @throws Parser.Failure if the extension or mime type is denied or if all parsers failed
     */
    public static Document[] parseSource(
            final MultiProtocolURI location,
            String mimeType,
            final String charset,
            final byte[] content,
            final boolean multipleVirtualDocs
        ) throws Parser.Failure {
        if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
        mimeType = normalizeMimeType(mimeType);
        final List<Parser> idioms = candidateParsers(location, mimeType);

        Document[] docs = parseSource(location, mimeType, idioms, charset, content);

        // finally enrich the docs set with virtual docs from the enclosed documents
        if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
        return docs;
    }

    /**
     * Parse content that is given as stream.
     * <p>
     * If only one parser is applicable or if contentLength exceeds Integer.MAX_VALUE, the stream
     * is handed directly to the first candidate parser. Otherwise the stream is first read into
     * a byte array so that several parsers can be tried one after another.
     * NOTE(review): in the direct stream case the multipleVirtualDocs flag is not applied - confirm
     * that this is intended.
     * @param location the url of the resource
     * @param mimeType the mime type as given by the source, may be null
     * @param charset the charset as given by the source
     * @param contentLength the length of the content
     * @param sourceStream the stream to read the content from
     * @param multipleVirtualDocs if true and exactly one document was parsed from the byte array,
     *        the result is replaced by the result of {@link #virtualDocs(Document)} for that document
     * @return the parsed documents
     * @throws Parser.Failure if the extension or mime type is denied, if reading fails or if parsing failed
     */
    public static Document[] parseSource(
            final MultiProtocolURI location,
            String mimeType,
            final String charset,
            final long contentLength,
            final InputStream sourceStream,
            final boolean multipleVirtualDocs
        ) throws Parser.Failure {
        if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
        mimeType = normalizeMimeType(mimeType);
        final List<Parser> idioms = candidateParsers(location, mimeType);

        // if we do not have more than one parser or the content size is over MaxInt
        // then we use only one stream-oriented parser.
        if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
            // use a specific stream-oriented parser
            return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
        }

        // in case that we know more parsers we first transform the content into a byte[] and use that as base
        // for a number of different parse attempts.
        final byte[] b;
        try {
            b = FileUtils.read(sourceStream, (int) contentLength);
        } catch (final IOException e) {
            throw new Parser.Failure(e.getMessage(), location);
        }
        Document[] docs = parseSource(location, mimeType, idioms, charset, b);

        // finally enrich the docs set with virtual docs from the enclosed documents
        if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
        return docs;
    }

    /**
     * Look up the candidate parsers for a location and an already normalized mime type.
     * A failure of the lookup is logged as warning and re-thrown with a message that names
     * the extension and the mime type.
     */
    private static List<Parser> candidateParsers(final MultiProtocolURI location, final String mimeType) throws Parser.Failure {
        final List<Parser> idioms;
        try {
            idioms = parsers(location, mimeType);
        } catch (final Parser.Failure e) {
            final String errorMsg = "Parser Failure for extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "': " + e.getMessage();
            log.logWarning(errorMsg);
            throw new Parser.Failure(errorMsg, location);
        }
        assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true, false);
        return idioms;
    }

    /**
     * Parse a stream with exactly one parser.
     * @throws Parser.Failure if the parser raises any exception
     */
    private static Document[] parseSource(
            final MultiProtocolURI location,
            final String mimeType,
            final Parser parser,
            final String charset,
            final long contentLength,
            final InputStream sourceStream
        ) throws Parser.Failure {
        if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
        final String fileExt = location.getFileExtension();
        final String documentCharset = htmlParser.patchCharsetEncoding(charset);
        assert parser != null;

        if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
        try {
            final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream);
            for (final Document d: docs) { assert d.getText() != null; } // verify docs
            return docs;
        } catch (final Exception e) {
            // this method cannot pass on an InterruptedException, so at least keep the interrupt status
            if (e instanceof InterruptedException) Thread.currentThread().interrupt();
            // the thrown failure carries no cause, therefore the cause is written to the log
            log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
            throw new Parser.Failure("parser failed: " + parser.getName(), location);
        }
    }

    /**
     * Parse a byte array by trying the given parsers in order until one of them returns a result.
     * @throws Parser.Failure if no content is given, if the memory request is not granted
     *         or if all parsers failed
     */
    private static Document[] parseSource(
            final MultiProtocolURI location,
            final String mimeType,
            final List<Parser> parsers,
            final String charset,
            final byte[] sourceArray
        ) throws Parser.Failure {
        final String fileExt = location.getFileExtension();
        if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "' from byte[]");
        if (sourceArray == null) {
            // report missing content with the declared failure type instead of a NullPointerException
            throw new Parser.Failure("No resource content available (byte array is null).", location);
        }
        final String documentCharset = htmlParser.patchCharsetEncoding(charset);
        assert !parsers.isEmpty();

        Document[] docs = null;
        final Map<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
        // the product is computed in long arithmetic and clamped, so that a very large array
        // cannot overflow int and turn the request into a negative size
        final int requestedBytes = (int) Math.min((long) Integer.MAX_VALUE, sourceArray.length * 6L);
        if (MemoryControl.request(requestedBytes, false)) {
            for (final Parser parser: parsers) {
                try {
                    docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
                } catch (final Parser.Failure e) {
                    failedParser.put(parser, e);
                } catch (final Exception e) {
                    // this method cannot pass on an InterruptedException, so at least keep the interrupt status
                    if (e instanceof InterruptedException) Thread.currentThread().interrupt();
                    failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
                }
                if (docs != null) break;
            }
        }

        if (docs == null) {
            if (failedParser.isEmpty()) {
                // no parser was tried or the tried parsers returned null without raising an exception
                final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed.";
                throw new Parser.Failure(errorMsg, location);
            }
            final StringBuilder failedParsers = new StringBuilder();
            for (final Entry<Parser, Parser.Failure> error: failedParser.entrySet()) {
                log.logWarning("tried parser '" + error.getKey().getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + error.getValue().getMessage(), error.getValue());
                failedParsers.append(error.getKey().getName()).append(' ');
            }
            throw new Parser.Failure("All parser failed: " + failedParsers, location);
        }
        for (final Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
        return docs;
    }

    /**
     * check if the parser supports the given content.
     * The content counts as not supported if the generic parser is the only candidate.
     * @param url
     * @param mimeType
     * @return returns null if the content is supported. If the content is not supported, return a error string.
     */
    public static String supports(final MultiProtocolURI url, final String mimeType) {
        try {
            // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
            final List<Parser> idioms = parsers(url, mimeType);
            final boolean genericOnly = idioms != null && idioms.size() == 1 && idioms.get(0).getName().equals(genericIdiom.getName());
            return (idioms == null || idioms.isEmpty() || genericOnly) ? "no parser found" : null;
        } catch (final Parser.Failure e) {
            // in case that a parser is not available, return a error string describing the problem.
            return e.getMessage();
        }
    }

    /**
     * find a parser for a given url and mime type
     * because mime types returned by web servers are sometimes wrong, we also compute the mime type again
     * from the extension that can be extracted from the url path. That means that there are 3 criteria
     * that can be used to select a parser:
     * - the given extension
     * - the given mime type
     * - the mime type computed from the extension
     * The generic parser is always appended as last candidate.
     * @param url the given url
     * @param mimeType1 the given mime type, may be null
     * @return a list of Idiom parsers that may be appropriate for the given criteria
     * @throws Parser.Failure if the extension or the mime type is on a deny list
     */
    private static List<Parser> parsers(final MultiProtocolURI url, String mimeType1) throws Parser.Failure {
        final List<Parser> idioms = new ArrayList<Parser>(2);

        // check extension
        String ext = url.getFileExtension();
        Parser idiom;
        if (ext != null && ext.length() > 0) {
            ext = ext.toLowerCase(Locale.ENGLISH);
            if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
            idiom = ext2parser.get(ext);
            if (idiom != null) idioms.add(idiom);
        }

        // check given mime type
        if (mimeType1 != null) {
            mimeType1 = normalizeMimeType(mimeType1);
            if (denyMime.containsKey(mimeType1)) throw new Parser.Failure("mime type '" + mimeType1 + "' is denied (1)", url);
            idiom = mime2parser.get(mimeType1);
            if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom);
        }

        // check mime type computed from extension
        // (a ConcurrentHashMap does not accept null keys, so a missing extension must be skipped)
        final String mimeType2 = (ext == null) ? null : ext2mime.get(ext);
        if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.add(idiom);

        // always add the generic parser
        idioms.add(genericIdiom);

        return idioms;
    }

    /**
     * check if a mime type can be parsed
     * @param mimeType the mime type, may be null
     * @return null if mimeType is null or if a parser is registered for the mime type and it is
     *         not denied; an error string otherwise
     */
    public static String supportsMime(String mimeType) {
        if (mimeType == null) return null;
        mimeType = normalizeMimeType(mimeType);
        if (denyMime.containsKey(mimeType)) return "mime type '" + mimeType + "' is denied (2)";
        if (mime2parser.get(mimeType) == null) return "no parser for mime '" + mimeType + "' available";
        return null;
    }

    /**
     * check if the file extension of an url can be parsed
     * @param url the url that provides the file extension
     * @return null if the url has no file extension or if a parser is registered for the
     *         extension and it is not denied; an error string otherwise
     */
    public static String supportsExtension(final MultiProtocolURI url) {
        final String rawExt = url.getFileExtension();
        // the null check must be done before the extension is lower-cased
        if (rawExt == null || rawExt.length() == 0) return null;
        final String ext = rawExt.toLowerCase(Locale.ENGLISH);
        if (denyExtensionx.containsKey(ext)) return "file extension '" + ext + "' is denied (2)";
        final String mimeType = ext2mime.get(ext);
        if (mimeType == null) return "no parser available";
        final Parser idiom = mime2parser.get(mimeType);
        assert idiom != null;
        if (idiom == null) return "no parser available (internal error!)";
        return null;
    }

    /**
     * @param url the url that provides the file extension
     * @return the mime type that is registered for the file extension of the url or null if there is none
     */
    public static String mimeOf(final MultiProtocolURI url) {
        return mimeOf(url.getFileExtension());
    }

    /**
     * @param ext a file extension, case is ignored
     * @return the mime type that is registered for the extension or null if there is none
     *         or if ext is null
     */
    public static String mimeOf(final String ext) {
        if (ext == null) return null;
        return ext2mime.get(ext.toLowerCase(Locale.ENGLISH));
    }

    /**
     * Normalize a mime type: lower case, without parameters (everything from the first ';') and trimmed.
     * @param mimeType the mime type, may be null
     * @return the normalized mime type, "application/octet-stream" if mimeType is null; never null
     */
    private static String normalizeMimeType(String mimeType) {
        if (mimeType == null) return "application/octet-stream";
        // a fixed locale is used because the result is a map key and must not depend on the default locale
        mimeType = mimeType.toLowerCase(Locale.ENGLISH);
        final int pos = mimeType.indexOf(';');
        return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
    }

    /**
     * Normalize a file extension for the deny list in the same way as the lookups do: trimmed and lower case.
     * @return the normalized extension, an empty string if ext is null
     */
    private static String normalizeExtension(final String ext) {
        return (ext == null) ? "" : ext.trim().toLowerCase(Locale.ENGLISH);
    }

    /**
     * Concatenate the keys of a deny map to a comma-separated string.
     */
    private static String joinKeys(final Map<String, Object> denyMap) {
        final StringBuilder sb = new StringBuilder();
        for (final String key: denyMap.keySet()) {
            if (sb.length() > 0) sb.append(',');
            sb.append(key);
        }
        return sb.toString();
    }

    /**
     * Replace the deny list of mime types.
     * @param denyList comma-separated mime types; empty entries are ignored, null clears the list
     */
    public static void setDenyMime(final String denyList) {
        denyMime.clear();
        if (denyList == null) return;
        for (final String s: denyList.split(",")) {
            final String n = normalizeMimeType(s);
            if (n.length() > 0) denyMime.put(n, v);
        }
    }

    /**
     * @return the denied mime types as comma-separated string, an empty string if nothing is denied
     */
    public static String getDenyMime() {
        return joinKeys(denyMime);
    }

    /**
     * Remove a mime type from the deny list or add it to the deny list.
     * @param mime the mime type
     * @param grant true to remove the mime type from the deny list, false to add it
     */
    public static void grantMime(final String mime, final boolean grant) {
        final String n = normalizeMimeType(mime);
        if (n.length() == 0) return;
        if (grant) denyMime.remove(n); else denyMime.put(n, v);
    }

    /**
     * Replace the deny list of file extensions.
     * Entries are trimmed and lower-cased because all lookups use the lower-case extension.
     * @param denyList comma-separated file extensions; empty entries are ignored, null clears the list
     */
    public static void setDenyExtension(final String denyList) {
        denyExtensionx.clear();
        if (denyList == null) return;
        for (final String s: denyList.split(",")) {
            final String n = normalizeExtension(s);
            if (n.length() > 0) denyExtensionx.put(n, v);
        }
    }

    /**
     * @return the denied file extensions as comma-separated string, an empty string if nothing is denied
     */
    public static String getDenyExtension() {
        return joinKeys(denyExtensionx);
    }

    /**
     * Remove a file extension from the deny list or add it to the deny list.
     * @param ext the file extension, case is ignored
     * @param grant true to remove the extension from the deny list, false to add it
     */
    public static void grantExtension(final String ext, final boolean grant) {
        final String n = normalizeExtension(ext);
        if (n.length() == 0) return;
        if (grant) denyExtensionx.remove(n); else denyExtensionx.put(n, v);
    }

    /**
     * produce virtual documents for each of the link that is contained in the document
     * @param document the document that provides the application, audio, video and image links
     * @return an array that starts with the given document, followed by one document for each link
     */
    public static Document[] virtualDocs(final Document document) {

        final List<Document> docs = new ArrayList<Document>();
        docs.add(document);
        for (final Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
            docs.add(genLinkDocs("application", link.getKey(), link.getValue(), document.getContentLanguages()));
        }
        for (final Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
            docs.add(genLinkDocs("audio", link.getKey(), link.getValue(), document.getContentLanguages()));
        }
        for (final Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
            docs.add(genLinkDocs("video", link.getKey(), link.getValue(), document.getContentLanguages()));
        }
        // only the image entries are needed, not the keys of the image map
        for (final ImageEntry img: document.getImages().values()) {
            docs.add(genImageDocs(img));
        }

        // finally return the list of documents
        return docs.toArray(new Document[docs.size()]);
    }

    /**
     * Create a document for a link, using the link description as title and as only section.
     */
    private static Document genLinkDocs(final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
        return new Document(
                uri,
                Classification.ext2mime(uri.getFileExtension()),
                "UTF-8",
                null,
                contentLanguages,
                null,
                descr,
                "",
                "",
                new String[]{descr},
                type,
                0.0f, 0.0f,
                uri.toNormalform(false, false),
                null,
                null,
                null,
                false);
    }

    /**
     * Create a document for an image, using the alt text of the image as title and as only section.
     */
    private static Document genImageDocs(final ImageEntry img) {
        return new Document(
                img.url(),
                Classification.ext2mime(img.url().getFileExtension()),
                "UTF-8",
                null,
                null,
                null,
                img.alt(),
                "",
                "",
                new String[]{img.alt()},
                "image",
                0.0f, 0.0f,
                img.url().toNormalform(false, false),
                null,
                null,
                null,
                false);
    }

}