allow multiple parser options instead of printing an error

13 years ago · 50c576599b
parent c02d742e53
commit 50c576599b
5 changed files with 113 additions and 164 deletions
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@ -26,10 +26,8 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
@ -75,8 +73,8 @@ public final class TextParser {
    private static final Object v = new Object();

    private static final Parser genericIdiom = new genericParser();
-    private static final Map<String, Parser> mime2parser = new ConcurrentHashMap<String, Parser>();
-    private static final Map<String, Parser> ext2parser = new ConcurrentHashMap<String, Parser>();
+    private static final Map<String, Set<Parser>> mime2parser = new ConcurrentHashMap<String, Set<Parser>>();
+    private static final Map<String, Set<Parser>> ext2parser = new ConcurrentHashMap<String, Set<Parser>>();
    private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
    private static final Map<String, Object> denyMime = new ConcurrentHashMap<String, Object>();
    private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<String, Object>();
@ -86,7 +84,7 @@ public final class TextParser {
        initParser(new csvParser());
        initParser(new docParser());
        initParser(new gzipParser());
-        initParser(new htmlParser("HTML Parser"));
+        initParser(new htmlParser());
        initParser(new genericImageParser());
        initParser(new mmParser());
        initParser(new odtParser());
@ -105,17 +103,17 @@ public final class TextParser {
        initParser(new vsdParser());
        initParser(new xlsParser());
        initParser(new zipParser());
-        initParser(new RDFaParser("RDFa Parser"));
+        initParser(new RDFaParser());
        initParser(new rdfParser());

-        if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation.RDFa", true)) initParser(new RDFaParser("RDFa Parser"));
-        if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation", true)) initParser(new AugmentParser("Augment Parser"));
+        if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation.RDFa", true)) initParser(new RDFaParser());
+        if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation", true)) initParser(new AugmentParser());
    }

    public static Set<Parser> parsers() {
        final Set<Parser> c = new HashSet<Parser>();
-        c.addAll(ext2parser.values());
-        c.addAll(mime2parser.values());
+        for (Set<Parser> pl: ext2parser.values()) c.addAll(pl);
+        for (Set<Parser> pl: mime2parser.values()) c.addAll(pl);
        return c;
    }

@ -125,25 +123,31 @@ public final class TextParser {
            // process the mime types
            final String mimeType = normalizeMimeType(mime);
            if (prototypeMime == null) prototypeMime = mimeType;
-            final Parser p0 = mime2parser.get(mimeType);
-            if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
-            mime2parser.put(mimeType, parser);
+            Set<Parser> p0 = mime2parser.get(mimeType);
+            if (p0 == null) {
+                p0 = new HashSet<Parser>();
+                mime2parser.put(mimeType, p0);
+            }
+            p0.add(parser);
            Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());
        }

        if (prototypeMime != null) for (String ext: parser.supportedExtensions()) {
            ext = ext.toLowerCase();
            final String s = ext2mime.get(ext);
-            if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
+            if (s != null && !s.equals(prototypeMime)) log.logWarning("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
            ext2mime.put(ext, prototypeMime);
        }

        for (String ext: parser.supportedExtensions()) {
            // process the extensions
            ext = ext.toLowerCase();
-            final Parser p0 = ext2parser.get(ext);
-            if (p0 != null) log.logSevere("parser for extension '" + ext + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
-            ext2parser.put(ext, parser);
+            Set<Parser> p0 = ext2parser.get(ext);
+            if (p0 == null) {
+                p0 = new HashSet<Parser>();
+                ext2parser.put(ext, p0);
+            }
+            p0.add(parser);
            Log.logInfo("PARSER", "Parser for extension '" + ext + "': " + parser.getName());
        }
    }
@ -187,7 +191,7 @@ public final class TextParser {
        ) throws Parser.Failure {
        if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
        mimeType = normalizeMimeType(mimeType);
-        List<Parser> idioms = null;
+        Set<Parser> idioms = null;
        try {
            idioms = parsers(location, mimeType);
        } catch (final Parser.Failure e) {
@ -211,7 +215,7 @@ public final class TextParser {
        ) throws Parser.Failure {
        if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
        mimeType = normalizeMimeType(mimeType);
-        List<Parser> idioms = null;
+        Set<Parser> idioms = null;
        try {
            idioms = parsers(location, mimeType);
        } catch (final Parser.Failure e) {
@ -225,7 +229,7 @@ public final class TextParser {
        // then we use only one stream-oriented parser.
        if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
            // use a specific stream-oriented parser
-            return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
+            return parseSource(location, mimeType, idioms.iterator().next(), charset, contentLength, sourceStream);
        }

        // in case that we know more parsers we first transform the content into a byte[] and use that as base
@ -267,7 +271,7 @@ public final class TextParser {
    private static Document[] parseSource(
            final DigestURI location,
            final String mimeType,
-            final List<Parser> parsers,
+            final Set<Parser> parsers,
            final String charset,
            final byte[] sourceArray
        ) throws Parser.Failure {
@ -334,8 +338,8 @@ public final class TextParser {
    public static String supports(final MultiProtocolURI url, final String mimeType) {
        try {
            // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
-            final List<Parser> idioms = parsers(url, mimeType);
-            return (idioms == null || idioms.isEmpty() || (idioms.size() == 1 && idioms.get(0).getName().equals(genericIdiom.getName()))) ? "no parser found" : null;
+            final Set<Parser> idioms = parsers(url, mimeType);
+            return (idioms == null || idioms.isEmpty() || (idioms.size() == 1 && idioms.iterator().next().getName().equals(genericIdiom.getName()))) ? "no parser found" : null;
        } catch (final Parser.Failure e) {
            // in case that a parser is not available, return a error string describing the problem.
            return e.getMessage();
@ -355,17 +359,17 @@ public final class TextParser {
     * @return a list of Idiom parsers that may be appropriate for the given criteria
     * @throws Parser.Failure
     */
-    private static List<Parser> parsers(final MultiProtocolURI url, String mimeType1) throws Parser.Failure {
-        final List<Parser> idioms = new ArrayList<Parser>(2);
+    private static Set<Parser> parsers(final MultiProtocolURI url, String mimeType1) throws Parser.Failure {
+        final Set<Parser> idioms = new HashSet<Parser>(2);

        // check extension
        String ext = url.getFileExtension();
-        Parser idiom;
+        Set<Parser> idiom;
        if (ext != null && ext.length() > 0) {
            ext = ext.toLowerCase();
            if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
            idiom = ext2parser.get(ext);
-            if (idiom != null) idioms.add(idiom);
+            if (idiom != null) idioms.addAll(idiom);
        }

        // check given mime type
@ -373,12 +377,12 @@ public final class TextParser {
            mimeType1 = normalizeMimeType(mimeType1);
            if (denyMime.containsKey(mimeType1)) throw new Parser.Failure("mime type '" + mimeType1 + "' is denied (1)", url);
            idiom = mime2parser.get(mimeType1);
-            if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom);
+            if (idiom != null && !idioms.contains(idiom)) idioms.addAll(idiom);
        }

        // check mime type computed from extension
        final String mimeType2 = ext2mime.get(ext);
-        if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.add(idiom);
+        if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.addAll(idiom);

        // always add the generic parser
        idioms.add(genericIdiom);
@ -412,9 +416,9 @@ public final class TextParser {
        if (denyExtensionx.containsKey(ext)) return "file extension '" + ext + "' is denied (2)";
        final String mimeType = ext2mime.get(ext);
        if (mimeType == null) return "no parser available";
-        final Parser idiom = mime2parser.get(mimeType);
+        final Set<Parser> idiom = mime2parser.get(mimeType);
        assert idiom != null;
-        if (idiom == null) return "no parser available (internal error!)";
+        if (idiom == null || idiom.size() == 0) return "no parser available (internal error!)";
        return null;
    }

--- a/source/net/yacy/document/parser/augment/AugmentParser.java
+++ b/source/net/yacy/document/parser/augment/AugmentParser.java
@ -6,45 +6,24 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Set;

+import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
+import net.yacy.document.Parser;
 import net.yacy.document.parser.rdfa.impl.RDFaParser;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.search.Switchboard;
 import de.anomic.data.ymark.YMarkUtil;


-public class AugmentParser extends RDFaParser {
+public class AugmentParser extends AbstractParser implements Parser {

-	public AugmentParser(String name) {
-		super(name);
+    RDFaParser rdfaParser;

-		System.out.println("augmented parser was initialized");
+	public AugmentParser() {
+		super("AugmentParser");
+		this.rdfaParser = new RDFaParser();

-		this.SUPPORTED_EXTENSIONS.remove("htm");
-		this.SUPPORTED_EXTENSIONS.remove("html");
-		this.SUPPORTED_EXTENSIONS.remove("shtml");
-		this.SUPPORTED_EXTENSIONS.remove("xhtml");
-		this.SUPPORTED_EXTENSIONS.remove("php");
-		this.SUPPORTED_EXTENSIONS.remove("php3");
-		this.SUPPORTED_EXTENSIONS.remove("php4");
-		this.SUPPORTED_EXTENSIONS.remove("php5");
-		this.SUPPORTED_EXTENSIONS.remove("cfm");
-		this.SUPPORTED_EXTENSIONS.remove("asp");
-		this.SUPPORTED_EXTENSIONS.remove("aspx");
-		this.SUPPORTED_EXTENSIONS.remove("tex");
-		this.SUPPORTED_EXTENSIONS.remove("txt");
-		this.SUPPORTED_EXTENSIONS.remove("jsp");
-		this.SUPPORTED_EXTENSIONS.remove("mf");
-		this.SUPPORTED_EXTENSIONS.remove("pl");
-		this.SUPPORTED_EXTENSIONS.remove("py");
-		this.SUPPORTED_MIME_TYPES.remove("text/html");
-		this.SUPPORTED_MIME_TYPES.remove("text/xhtml+xml");
-		this.SUPPORTED_MIME_TYPES.remove("application/xhtml+xml");
-		this.SUPPORTED_MIME_TYPES.remove("application/x-httpd-php");
-		this.SUPPORTED_MIME_TYPES.remove("application/x-tex");
-		this.SUPPORTED_MIME_TYPES.remove("text/plain");
-		this.SUPPORTED_MIME_TYPES.remove("text/sgml");
-		this.SUPPORTED_MIME_TYPES.remove("text/csv");
+		System.out.println("augmented parser was initialized");

 		this.SUPPORTED_EXTENSIONS.add("html");
 		this.SUPPORTED_EXTENSIONS.add("php");
@ -59,7 +38,7 @@ public class AugmentParser extends RDFaParser {
 			String charset, InputStream source) throws Failure,
 			InterruptedException {

-		Document[] htmlDocs = super.parse(url, mimeType, charset, source);
+		Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);
 		try {
 			source.reset();
 		} catch (IOException e) {
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@ -53,8 +53,8 @@ public class htmlParser extends AbstractParser implements Parser {

    private static final Pattern patternUnderline = Pattern.compile("_");

-    public htmlParser(String name) {
-        super(name);
+    public htmlParser() {
+        super("Streaming HTML Parser");
        this.SUPPORTED_EXTENSIONS.add("htm");
        this.SUPPORTED_EXTENSIONS.add("html");
        this.SUPPORTED_EXTENSIONS.add("phtml");
@ -299,7 +299,7 @@ public class htmlParser extends AbstractParser implements Parser {
        try {
            url = new DigestURI(args[0]);
            final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
-            final Document[] document = new htmlParser("HTML Parser").parse(url, "text/html", null, new ByteArrayInputStream(content));
+            final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
            final String title = document[0].dc_title();
            System.out.println(title);
            System.out.println(CharacterCoding.unicode2html(title, false));
--- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
+++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
@ -3,13 +3,21 @@
 */
 package net.yacy.document.parser.rdfa.impl;

+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.io.Reader;
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.HashSet;
 import java.util.Set;

+import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
+import net.yacy.document.Parser;
 import net.yacy.document.parser.htmlParser;
 import net.yacy.document.parser.rdfa.IRDFaTriple;
 import net.yacy.kelondro.data.meta.DigestURI;
@ -19,35 +27,13 @@ import net.yacy.kelondro.logging.Log;
 * @author fgandon
 *
 */
-public class RDFaParser extends htmlParser {
-
-	public RDFaParser(String name) {
-		super(name);
-		this.SUPPORTED_EXTENSIONS.remove("htm");
-		this.SUPPORTED_EXTENSIONS.remove("html");
-		this.SUPPORTED_EXTENSIONS.remove("shtml");
-		this.SUPPORTED_EXTENSIONS.remove("xhtml");
-		this.SUPPORTED_EXTENSIONS.remove("php");
-		this.SUPPORTED_EXTENSIONS.remove("php3");
-		this.SUPPORTED_EXTENSIONS.remove("php4");
-		this.SUPPORTED_EXTENSIONS.remove("php5");
-		this.SUPPORTED_EXTENSIONS.remove("cfm");
-		this.SUPPORTED_EXTENSIONS.remove("asp");
-		this.SUPPORTED_EXTENSIONS.remove("aspx");
-		this.SUPPORTED_EXTENSIONS.remove("tex");
-		this.SUPPORTED_EXTENSIONS.remove("txt");
-		this.SUPPORTED_EXTENSIONS.remove("jsp");
-		this.SUPPORTED_EXTENSIONS.remove("mf");
-		this.SUPPORTED_EXTENSIONS.remove("pl");
-		this.SUPPORTED_EXTENSIONS.remove("py");
-		this.SUPPORTED_MIME_TYPES.remove("text/html");
-		this.SUPPORTED_MIME_TYPES.remove("text/xhtml+xml");
-		this.SUPPORTED_MIME_TYPES.remove("application/xhtml+xml");
-		this.SUPPORTED_MIME_TYPES.remove("application/x-httpd-php");
-		this.SUPPORTED_MIME_TYPES.remove("application/x-tex");
-		this.SUPPORTED_MIME_TYPES.remove("text/plain");
-		this.SUPPORTED_MIME_TYPES.remove("text/sgml");
-		this.SUPPORTED_MIME_TYPES.remove("text/csv");
+public class RDFaParser extends AbstractParser implements Parser {
+
+    private final htmlParser hp;
+
+	public RDFaParser() {
+		super("RDFa Parser");
+		this.hp = new htmlParser();

 		this.SUPPORTED_EXTENSIONS.add("html");
 		this.SUPPORTED_EXTENSIONS.add("php");
@ -116,7 +102,7 @@ public class RDFaParser extends htmlParser {

 		Document[] htmlDocs = null;
 		try {
-			htmlDocs = super.parse(url, mimeType, charset, source);
+			htmlDocs = this.hp.parse(url, mimeType, charset, source);
 			source.reset();

 		} catch (IOException e1) {
@ -129,9 +115,9 @@ public class RDFaParser extends htmlParser {
 	private Document convertAllTriplesToDocument(DigestURI url,
 			String mimeType, String charset, IRDFaTriple[] allTriples) {

-		Set<String> languages = new HashSet<String>(2);
+		//Set<String> languages = new HashSet<String>(2);
 		Set<String> keywords = new HashSet<String>(allTriples.length);
-		Set<String> sections = new HashSet<String>(5);
+		//Set<String> sections = new HashSet<String>(5);
 		String all = "";

 		for (IRDFaTriple irdFaTriple : allTriples) {
@ -166,4 +152,51 @@ public class RDFaParser extends htmlParser {
 		}
 	}

+	public static void main(String[] args) {
+        URL aURL = null;
+        if (args.length < 1) {
+            System.out
+                    .println("Usage: one and only one argument giving a file path or a URL.");
+        } else {
+            File aFile = new File(args[0]);
+            Reader aReader = null;
+            if (aFile.exists()) {
+                try {
+                    aReader = new FileReader(aFile);
+                } catch (FileNotFoundException e) {
+                    aReader = null;
+                }
+            } else {
+                try {
+                    aURL = new URL(args[0]);
+                    aReader = new InputStreamReader(aURL.openStream());
+                } catch (MalformedURLException e) {
+                } catch (IOException e) {
+                    e.printStackTrace();
+                    aReader = null;
+                }
+
+            }
+
+            if (aReader != null) {
+                RDFaParser aParser = new RDFaParser();
+                try {
+                    aParser.parse(new DigestURI(args[0]),"","",aURL.openStream());
+                } catch (FileNotFoundException e) {
+                    e.printStackTrace();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                } catch (Failure e) {
+                    // TODO Auto-generated catch block
+                    e.printStackTrace();
+                } catch (InterruptedException e) {
+                    // TODO Auto-generated catch block
+                    e.printStackTrace();
+                }
+            } else
+                System.out.println("File or URL not recognized.");
+
+        }
+
+    }
 }
--- a/source/net/yacy/document/parser/rdfa/main.java
+++ b/source/net/yacy/document/parser/rdfa/main.java
@ -1,67 +0,0 @@
-package net.yacy.document.parser.rdfa;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.net.MalformedURLException;
-import java.net.URL;
-
-import net.yacy.document.Parser.Failure;
-import net.yacy.document.parser.rdfa.impl.RDFaParser;
-import net.yacy.kelondro.data.meta.DigestURI;
-
-public class main {
-	/**
-	 * @param args
-	 */
-	public static void main(String[] args) {
-		URL aURL = null;
-		if (args.length < 1) {
-			System.out
-					.println("Usage: one and only one argument giving a file path or a URL.");
-		} else {
-			File aFile = new File(args[0]);
-			Reader aReader = null;
-			if (aFile.exists()) {
-				try {
-					aReader = new FileReader(aFile);
-				} catch (FileNotFoundException e) {
-					aReader = null;
-				}
-			} else {
-				try {
-					aURL = new URL(args[0]);
-					aReader = new InputStreamReader(aURL.openStream());
-				} catch (MalformedURLException e) {
-				} catch (IOException e) {
-					e.printStackTrace();
-					aReader = null;
-				}
-
-			}
-
-			if (aReader != null) {
-				RDFaParser aParser = new RDFaParser("html");
-				try {
-					aParser.parse(new DigestURI(args[0]),"","",aURL.openStream());
-				} catch (FileNotFoundException e) {
-					e.printStackTrace();
-				} catch (IOException e) {
-					e.printStackTrace();
-				} catch (Failure e) {
-					// TODO Auto-generated catch block
-					e.printStackTrace();
-				} catch (InterruptedException e) {
-					// TODO Auto-generated catch block
-					e.printStackTrace();
-				}
-			} else
-				System.out.println("File or URL not recognized.");
-
-		}
-
-	}
-}