From 50c576599b0773f383fa9e2bb9a9563e47446deb Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Tue, 12 Jun 2012 01:42:58 +0200
Subject: [PATCH] allow multiple parser options instead of printing an error

---
 source/net/yacy/document/TextParser.java      | 66 +++++++------
 .../parser/augment/AugmentParser.java         | 39 ++------
 .../net/yacy/document/parser/htmlParser.java  |  6 +-
 .../document/parser/rdfa/impl/RDFaParser.java | 99 ++++++++++++-------
 .../net/yacy/document/parser/rdfa/main.java   | 67 -------------
 5 files changed, 113 insertions(+), 164 deletions(-)
 delete mode 100644 source/net/yacy/document/parser/rdfa/main.java
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index cada8a776..3dec938ba 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -26,10 +26,8 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
@@ -75,8 +73,8 @@ public final class TextParser {
     private static final Object v = new Object();
 
     private static final Parser genericIdiom = new genericParser();
-    private static final Map<String, Parser> mime2parser = new ConcurrentHashMap<String, Parser>();
-    private static final Map<String, Parser> ext2parser = new ConcurrentHashMap<String, Parser>();
+    private static final Map<String, Set<Parser>> mime2parser = new ConcurrentHashMap<String, Set<Parser>>();
+    private static final Map<String, Set<Parser>> ext2parser = new ConcurrentHashMap<String, Set<Parser>>();
     private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
     private static final Map<String, Object> denyMime = new ConcurrentHashMap<String, Object>();
     private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<String, Object>();
@@ -86,7 +84,7 @@ public final class TextParser {
         initParser(new csvParser());
         initParser(new docParser());
         initParser(new gzipParser());
-        initParser(new htmlParser("HTML Parser"));
+        initParser(new htmlParser());
         initParser(new genericImageParser());
         initParser(new mmParser());
         initParser(new odtParser());
@@ -105,17 +103,17 @@ public final class TextParser {
         initParser(new vsdParser());
         initParser(new xlsParser());
         initParser(new zipParser());
-        initParser(new RDFaParser("RDFa Parser"));
+        initParser(new RDFaParser());
         initParser(new rdfParser());
 
-        if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation.RDFa", true)) initParser(new RDFaParser("RDFa Parser"));
-        if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation", true)) initParser(new AugmentParser("Augment Parser"));
+        if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation.RDFa", true)) initParser(new RDFaParser());
+        if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation", true)) initParser(new AugmentParser());
     }
 
     public static Set<Parser> parsers() {
         final Set<Parser> c = new HashSet<Parser>();
-        c.addAll(ext2parser.values());
-        c.addAll(mime2parser.values());
+        for (Set<Parser> pl: ext2parser.values()) c.addAll(pl);
+        for (Set<Parser> pl: mime2parser.values()) c.addAll(pl);
         return c;
     }
 
@@ -125,25 +123,31 @@ public final class TextParser {
             // process the mime types
             final String mimeType = normalizeMimeType(mime);
             if (prototypeMime == null) prototypeMime = mimeType;
-            final Parser p0 = mime2parser.get(mimeType);
-            if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
-            mime2parser.put(mimeType, parser);
+            Set<Parser> p0 = mime2parser.get(mimeType);
+            if (p0 == null) {
+                p0 = new HashSet<Parser>();
+                mime2parser.put(mimeType, p0);
+            }
+            p0.add(parser);
             Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());
         }
 
         if (prototypeMime != null) for (String ext: parser.supportedExtensions()) {
             ext = ext.toLowerCase();
             final String s = ext2mime.get(ext);
-            if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
+            if (s != null && !s.equals(prototypeMime)) log.logWarning("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
             ext2mime.put(ext, prototypeMime);
         }
 
         for (String ext: parser.supportedExtensions()) {
             // process the extensions
             ext = ext.toLowerCase();
-            final Parser p0 = ext2parser.get(ext);
-            if (p0 != null) log.logSevere("parser for extension '" + ext + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
-            ext2parser.put(ext, parser);
+            Set<Parser> p0 = ext2parser.get(ext);
+            if (p0 == null) {
+                p0 = new HashSet<Parser>();
+                ext2parser.put(ext, p0);
+            }
+            p0.add(parser);
             Log.logInfo("PARSER", "Parser for extension '" + ext + "': " + parser.getName());
         }
     }
@@ -187,7 +191,7 @@ public final class TextParser {
         ) throws Parser.Failure {
         if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
         mimeType = normalizeMimeType(mimeType);
-        List<Parser> idioms = null;
+        Set<Parser> idioms = null;
         try {
             idioms = parsers(location, mimeType);
         } catch (final Parser.Failure e) {
@@ -211,7 +215,7 @@ public final class TextParser {
         ) throws Parser.Failure {
         if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
         mimeType = normalizeMimeType(mimeType);
-        List<Parser> idioms = null;
+        Set<Parser> idioms = null;
         try {
             idioms = parsers(location, mimeType);
         } catch (final Parser.Failure e) {
@@ -225,7 +229,7 @@ public final class TextParser {
         // then we use only one stream-oriented parser.
         if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
             // use a specific stream-oriented parser
-            return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
+            return parseSource(location, mimeType, idioms.iterator().next(), charset, contentLength, sourceStream);
         }
 
         // in case that we know more parsers we first transform the content into a byte[] and use that as base
@@ -267,7 +271,7 @@ public final class TextParser {
     private static Document[] parseSource(
             final DigestURI location,
             final String mimeType,
-            final List<Parser> parsers,
+            final Set<Parser> parsers,
             final String charset,
             final byte[] sourceArray
         ) throws Parser.Failure {
@@ -334,8 +338,8 @@ public final class TextParser {
     public static String supports(final MultiProtocolURI url, final String mimeType) {
         try {
             // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
-            final List<Parser> idioms = parsers(url, mimeType);
-            return (idioms == null || idioms.isEmpty() || (idioms.size() == 1 && idioms.get(0).getName().equals(genericIdiom.getName()))) ? "no parser found" : null;
+            final Set<Parser> idioms = parsers(url, mimeType);
+            return (idioms == null || idioms.isEmpty() || (idioms.size() == 1 && idioms.iterator().next().getName().equals(genericIdiom.getName()))) ? "no parser found" : null;
         } catch (final Parser.Failure e) {
             // in case that a parser is not available, return a error string describing the problem.
             return e.getMessage();
@@ -355,17 +359,17 @@ public final class TextParser {
      * @return a list of Idiom parsers that may be appropriate for the given criteria
      * @throws Parser.Failure
      */
-    private static List<Parser> parsers(final MultiProtocolURI url, String mimeType1) throws Parser.Failure {
-        final List<Parser> idioms = new ArrayList<Parser>(2);
+    private static Set<Parser> parsers(final MultiProtocolURI url, String mimeType1) throws Parser.Failure {
+        final Set<Parser> idioms = new HashSet<Parser>(2);
 
         // check extension
         String ext = url.getFileExtension();
-        Parser idiom;
+        Set<Parser> idiom;
         if (ext != null && ext.length() > 0) {
             ext = ext.toLowerCase();
             if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
             idiom = ext2parser.get(ext);
-            if (idiom != null) idioms.add(idiom);
+            if (idiom != null) idioms.addAll(idiom);
         }
 
         // check given mime type
@@ -373,12 +377,12 @@ public final class TextParser {
             mimeType1 = normalizeMimeType(mimeType1);
             if (denyMime.containsKey(mimeType1)) throw new Parser.Failure("mime type '" + mimeType1 + "' is denied (1)", url);
             idiom = mime2parser.get(mimeType1);
-            if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom);
+            if (idiom != null && !idioms.contains(idiom)) idioms.addAll(idiom);
         }
 
         // check mime type computed from extension
         final String mimeType2 = ext2mime.get(ext);
-        if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.add(idiom);
+        if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.addAll(idiom);
 
         // always add the generic parser
         idioms.add(genericIdiom);
@@ -412,9 +416,9 @@ public final class TextParser {
         if (denyExtensionx.containsKey(ext)) return "file extension '" + ext + "' is denied (2)";
         final String mimeType = ext2mime.get(ext);
         if (mimeType == null) return "no parser available";
-        final Parser idiom = mime2parser.get(mimeType);
+        final Set<Parser> idiom = mime2parser.get(mimeType);
         assert idiom != null;
-        if (idiom == null) return "no parser available (internal error!)";
+        if (idiom == null || idiom.size() == 0) return "no parser available (internal error!)";
         return null;
     }
 
diff --git a/source/net/yacy/document/parser/augment/AugmentParser.java b/source/net/yacy/document/parser/augment/AugmentParser.java
index 55e96858e..1eb8a5361 100644
--- a/source/net/yacy/document/parser/augment/AugmentParser.java
+++ b/source/net/yacy/document/parser/augment/AugmentParser.java
@@ -6,45 +6,24 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Set;
 
+import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
+import net.yacy.document.Parser;
 import net.yacy.document.parser.rdfa.impl.RDFaParser;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.search.Switchboard;
 import de.anomic.data.ymark.YMarkUtil;
 
 
-public class AugmentParser extends RDFaParser {
+public class AugmentParser extends AbstractParser implements Parser {
 
-	public AugmentParser(String name) {
-		super(name);
+    RDFaParser rdfaParser;
 
-		System.out.println("augmented parser was initialized");
+	public AugmentParser() {
+		super("AugmentParser");
+		this.rdfaParser = new RDFaParser();
 
-		this.SUPPORTED_EXTENSIONS.remove("htm");
-		this.SUPPORTED_EXTENSIONS.remove("html");
-		this.SUPPORTED_EXTENSIONS.remove("shtml");
-		this.SUPPORTED_EXTENSIONS.remove("xhtml");
-		this.SUPPORTED_EXTENSIONS.remove("php");
-		this.SUPPORTED_EXTENSIONS.remove("php3");
-		this.SUPPORTED_EXTENSIONS.remove("php4");
-		this.SUPPORTED_EXTENSIONS.remove("php5");
-		this.SUPPORTED_EXTENSIONS.remove("cfm");
-		this.SUPPORTED_EXTENSIONS.remove("asp");
-		this.SUPPORTED_EXTENSIONS.remove("aspx");
-		this.SUPPORTED_EXTENSIONS.remove("tex");
-		this.SUPPORTED_EXTENSIONS.remove("txt");
-		this.SUPPORTED_EXTENSIONS.remove("jsp");
-		this.SUPPORTED_EXTENSIONS.remove("mf");
-		this.SUPPORTED_EXTENSIONS.remove("pl");
-		this.SUPPORTED_EXTENSIONS.remove("py");
-		this.SUPPORTED_MIME_TYPES.remove("text/html");
-		this.SUPPORTED_MIME_TYPES.remove("text/xhtml+xml");
-		this.SUPPORTED_MIME_TYPES.remove("application/xhtml+xml");
-		this.SUPPORTED_MIME_TYPES.remove("application/x-httpd-php");
-		this.SUPPORTED_MIME_TYPES.remove("application/x-tex");
-		this.SUPPORTED_MIME_TYPES.remove("text/plain");
-		this.SUPPORTED_MIME_TYPES.remove("text/sgml");
-		this.SUPPORTED_MIME_TYPES.remove("text/csv");
+		System.out.println("augmented parser was initialized");
 
 		this.SUPPORTED_EXTENSIONS.add("html");
 		this.SUPPORTED_EXTENSIONS.add("php");
@@ -59,7 +38,7 @@ public class AugmentParser extends RDFaParser {
 			String charset, InputStream source) throws Failure,
 			InterruptedException {
 
-		Document[] htmlDocs = super.parse(url, mimeType, charset, source);
+		Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source);
 		try {
 			source.reset();
 		} catch (IOException e) {
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index 77918959e..0d5f7b9ae 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -53,8 +53,8 @@ public class htmlParser extends AbstractParser implements Parser {
 
     private static final Pattern patternUnderline = Pattern.compile("_");
 
-    public htmlParser(String name) {
-        super(name);
+    public htmlParser() {
+        super("Streaming HTML Parser");
         this.SUPPORTED_EXTENSIONS.add("htm");
         this.SUPPORTED_EXTENSIONS.add("html");
         this.SUPPORTED_EXTENSIONS.add("phtml");
@@ -299,7 +299,7 @@ public class htmlParser extends AbstractParser implements Parser {
         try {
             url = new DigestURI(args[0]);
             final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
-            final Document[] document = new htmlParser("HTML Parser").parse(url, "text/html", null, new ByteArrayInputStream(content));
+            final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
             final String title = document[0].dc_title();
             System.out.println(title);
             System.out.println(CharacterCoding.unicode2html(title, false));
diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
index 6482ca59a..0b3b9c09c 100644
--- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
+++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
@@ -3,13 +3,21 @@
  */
 package net.yacy.document.parser.rdfa.impl;
 
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.io.Reader;
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.HashSet;
 import java.util.Set;
 
+import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
+import net.yacy.document.Parser;
 import net.yacy.document.parser.htmlParser;
 import net.yacy.document.parser.rdfa.IRDFaTriple;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -19,35 +27,13 @@ import net.yacy.kelondro.logging.Log;
  * @author fgandon
  *
  */
-public class RDFaParser extends htmlParser {
-
-	public RDFaParser(String name) {
-		super(name);
-		this.SUPPORTED_EXTENSIONS.remove("htm");
-		this.SUPPORTED_EXTENSIONS.remove("html");
-		this.SUPPORTED_EXTENSIONS.remove("shtml");
-		this.SUPPORTED_EXTENSIONS.remove("xhtml");
-		this.SUPPORTED_EXTENSIONS.remove("php");
-		this.SUPPORTED_EXTENSIONS.remove("php3");
-		this.SUPPORTED_EXTENSIONS.remove("php4");
-		this.SUPPORTED_EXTENSIONS.remove("php5");
-		this.SUPPORTED_EXTENSIONS.remove("cfm");
-		this.SUPPORTED_EXTENSIONS.remove("asp");
-		this.SUPPORTED_EXTENSIONS.remove("aspx");
-		this.SUPPORTED_EXTENSIONS.remove("tex");
-		this.SUPPORTED_EXTENSIONS.remove("txt");
-		this.SUPPORTED_EXTENSIONS.remove("jsp");
-		this.SUPPORTED_EXTENSIONS.remove("mf");
-		this.SUPPORTED_EXTENSIONS.remove("pl");
-		this.SUPPORTED_EXTENSIONS.remove("py");
-		this.SUPPORTED_MIME_TYPES.remove("text/html");
-		this.SUPPORTED_MIME_TYPES.remove("text/xhtml+xml");
-		this.SUPPORTED_MIME_TYPES.remove("application/xhtml+xml");
-		this.SUPPORTED_MIME_TYPES.remove("application/x-httpd-php");
-		this.SUPPORTED_MIME_TYPES.remove("application/x-tex");
-		this.SUPPORTED_MIME_TYPES.remove("text/plain");
-		this.SUPPORTED_MIME_TYPES.remove("text/sgml");
-		this.SUPPORTED_MIME_TYPES.remove("text/csv");
+public class RDFaParser extends AbstractParser implements Parser {
+
+    private final htmlParser hp;
+
+	public RDFaParser() {
+		super("RDFa Parser");
+		this.hp = new htmlParser();
 
 		this.SUPPORTED_EXTENSIONS.add("html");
 		this.SUPPORTED_EXTENSIONS.add("php");
@@ -58,7 +44,7 @@ public class RDFaParser extends htmlParser {
 	}
 
 	@Override
-	public Document[] parse(DigestURI url, String mimeType,
+    public Document[] parse(DigestURI url, String mimeType,
 			String charset, InputStream source) throws Failure,
 			InterruptedException {
 
@@ -116,7 +102,7 @@ public class RDFaParser extends htmlParser {
 
 		Document[] htmlDocs = null;
 		try {
-			htmlDocs = super.parse(url, mimeType, charset, source);
+			htmlDocs = this.hp.parse(url, mimeType, charset, source);
 			source.reset();
 
 		} catch (IOException e1) {
@@ -129,9 +115,9 @@ public class RDFaParser extends htmlParser {
 	private Document convertAllTriplesToDocument(DigestURI url,
 			String mimeType, String charset, IRDFaTriple[] allTriples) {
 
-		Set<String> languages = new HashSet<String>(2);
+		//Set<String> languages = new HashSet<String>(2);
 		Set<String> keywords = new HashSet<String>(allTriples.length);
-		Set<String> sections = new HashSet<String>(5);
+		//Set<String> sections = new HashSet<String>(5);
 		String all = "";
 
 		for (IRDFaTriple irdFaTriple : allTriples) {
@@ -166,4 +152,51 @@ public class RDFaParser extends htmlParser {
 		}
 	}
 
+	/**
+	 * Command-line test entry point: parses the file or URL given as the single argument.
+	 * A local file is addressed through its file: URL, so both cases share one code path.
+	 *
+	 * @param args one argument, a file path or a URL
+	 */
+	public static void main(String[] args) {
+        if (args.length < 1) {
+            System.out
+                    .println("Usage: one and only one argument giving a file path or a URL.");
+            return;
+        }
+
+        URL aURL = null;
+        try {
+            File aFile = new File(args[0]);
+            // an existing file is turned into its file: URL so that openStream() below works for it too
+            aURL = aFile.exists() ? aFile.toURI().toURL() : new URL(args[0]);
+        } catch (MalformedURLException e) {
+            // neither an existing file nor a well-formed URL; aURL stays null and is reported below
+        }
+        if (aURL == null) {
+            System.out.println("File or URL not recognized.");
+            return;
+        }
+
+        InputStream aStream = null;
+        try {
+            aStream = aURL.openStream();
+            // NOTE(review): assumes DigestURI accepts file: URLs as well as http(s) ones - confirm
+            new RDFaParser().parse(new DigestURI(aURL.toExternalForm()), "", "", aStream);
+        } catch (IOException e) {
+            e.printStackTrace();
+        } catch (Failure e) {
+            e.printStackTrace();
+        } catch (InterruptedException e) {
+            e.printStackTrace();
+        } finally {
+            if (aStream != null) {
+                try {
+                    aStream.close();
+                } catch (IOException e) {
+                    // a failed close cannot be handled meaningfully in this command-line helper
+                }
+            }
+        }
+    }
 }
diff --git a/source/net/yacy/document/parser/rdfa/main.java b/source/net/yacy/document/parser/rdfa/main.java
deleted file mode 100644
index 50c8d2b19..000000000
--- a/source/net/yacy/document/parser/rdfa/main.java
+++ /dev/null
@@ -1,67 +0,0 @@
-package net.yacy.document.parser.rdfa;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.net.MalformedURLException;
-import java.net.URL;
-
-import net.yacy.document.Parser.Failure;
-import net.yacy.document.parser.rdfa.impl.RDFaParser;
-import net.yacy.kelondro.data.meta.DigestURI;
-
-public class main {
-	/**
-	 * @param args
-	 */
-	public static void main(String[] args) {
-		URL aURL = null;
-		if (args.length < 1) {
-			System.out
-					.println("Usage: one and only one argument giving a file path or a URL.");
-		} else {
-			File aFile = new File(args[0]);
-			Reader aReader = null;
-			if (aFile.exists()) {
-				try {
-					aReader = new FileReader(aFile);
-				} catch (FileNotFoundException e) {
-					aReader = null;
-				}
-			} else {
-				try {
-					aURL = new URL(args[0]);
-					aReader = new InputStreamReader(aURL.openStream());
-				} catch (MalformedURLException e) {
-				} catch (IOException e) {
-					e.printStackTrace();
-					aReader = null;
-				}
-
-			}
-
-			if (aReader != null) {
-				RDFaParser aParser = new RDFaParser("html");
-				try {
-					aParser.parse(new DigestURI(args[0]),"","",aURL.openStream());
-				} catch (FileNotFoundException e) {
-					e.printStackTrace();
-				} catch (IOException e) {
-					e.printStackTrace();
-				} catch (Failure e) {
-					// TODO Auto-generated catch block
-					e.printStackTrace();
-				} catch (InterruptedException e) {
-					// TODO Auto-generated catch block
-					e.printStackTrace();
-				}
-			} else
-				System.out.println("File or URL not recognized.");
-
-		}
-
-	}
-}