From 50c576599b0773f383fa9e2bb9a9563e47446deb Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 12 Jun 2012 01:42:58 +0200 Subject: [PATCH] allow multiple parser options instead of printing an error --- source/net/yacy/document/TextParser.java | 66 +++++++------ .../parser/augment/AugmentParser.java | 39 ++------ .../net/yacy/document/parser/htmlParser.java | 6 +- .../document/parser/rdfa/impl/RDFaParser.java | 99 ++++++++++++------- .../net/yacy/document/parser/rdfa/main.java | 67 ------------- 5 files changed, 113 insertions(+), 164 deletions(-) delete mode 100644 source/net/yacy/document/parser/rdfa/main.java diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index cada8a776..3dec938ba 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -26,10 +26,8 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; -import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -75,8 +73,8 @@ public final class TextParser { private static final Object v = new Object(); private static final Parser genericIdiom = new genericParser(); - private static final Map mime2parser = new ConcurrentHashMap(); - private static final Map ext2parser = new ConcurrentHashMap(); + private static final Map> mime2parser = new ConcurrentHashMap>(); + private static final Map> ext2parser = new ConcurrentHashMap>(); private static final Map ext2mime = new ConcurrentHashMap(); private static final Map denyMime = new ConcurrentHashMap(); private static final Map denyExtensionx = new ConcurrentHashMap(); @@ -86,7 +84,7 @@ public final class TextParser { initParser(new csvParser()); initParser(new docParser()); initParser(new gzipParser()); - initParser(new htmlParser("HTML Parser")); + initParser(new htmlParser()); initParser(new genericImageParser()); initParser(new mmParser()); initParser(new odtParser()); @@ -105,17 +103,17 @@ public final class TextParser { initParser(new vsdParser()); initParser(new xlsParser()); initParser(new zipParser()); - initParser(new RDFaParser("RDFa Parser")); + initParser(new RDFaParser()); initParser(new rdfParser()); - if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation.RDFa", true)) initParser(new RDFaParser("RDFa Parser")); - if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation", true)) initParser(new AugmentParser("Augment Parser")); + if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation.RDFa", true)) initParser(new RDFaParser()); + if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation", true)) initParser(new AugmentParser()); } public static Set parsers() { final Set c = new HashSet(); - c.addAll(ext2parser.values()); - c.addAll(mime2parser.values()); + for (Set pl: ext2parser.values()) c.addAll(pl); + for (Set pl: mime2parser.values()) c.addAll(pl); return c; } @@ -125,25 +123,31 @@ public final class TextParser { // process the mime types final String mimeType = normalizeMimeType(mime); if (prototypeMime == null) prototypeMime = mimeType; - final Parser p0 = mime2parser.get(mimeType); - if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'."); - mime2parser.put(mimeType, parser); + Set p0 = mime2parser.get(mimeType); + if (p0 == null) { + p0 = new HashSet(); + mime2parser.put(mimeType, p0); + } + p0.add(parser); Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName()); } if (prototypeMime != null) for (String ext: parser.supportedExtensions()) { ext = ext.toLowerCase(); final String s = ext2mime.get(ext); - if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'."); + if (s != null && !s.equals(prototypeMime)) log.logWarning("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'."); ext2mime.put(ext, prototypeMime); } for (String ext: parser.supportedExtensions()) { // process the extensions ext = ext.toLowerCase(); - final Parser p0 = ext2parser.get(ext); - if (p0 != null) log.logSevere("parser for extension '" + ext + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'."); - ext2parser.put(ext, parser); + Set p0 = ext2parser.get(ext); + if (p0 == null) { + p0 = new HashSet(); + ext2parser.put(ext, p0); + } + p0.add(parser); Log.logInfo("PARSER", "Parser for extension '" + ext + "': " + parser.getName()); } } @@ -187,7 +191,7 @@ public final class TextParser { ) throws Parser.Failure { if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array"); mimeType = normalizeMimeType(mimeType); - List idioms = null; + Set idioms = null; try { idioms = parsers(location, mimeType); } catch (final Parser.Failure e) { @@ -211,7 +215,7 @@ public final class TextParser { ) throws Parser.Failure { if (log.isFine()) log.logFine("Parsing '" + location + "' from stream"); mimeType = normalizeMimeType(mimeType); - List idioms = null; + Set idioms = null; try { idioms = parsers(location, mimeType); } catch (final Parser.Failure e) { @@ -225,7 +229,7 @@ public final class TextParser { // then we use only one stream-oriented parser. if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) { // use a specific stream-oriented parser - return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream); + return parseSource(location, mimeType, idioms.iterator().next(), charset, contentLength, sourceStream); } // in case that we know more parsers we first transform the content into a byte[] and use that as base @@ -267,7 +271,7 @@ public final class TextParser { private static Document[] parseSource( final DigestURI location, final String mimeType, - final List parsers, + final Set parsers, final String charset, final byte[] sourceArray ) throws Parser.Failure { @@ -334,8 +338,8 @@ public final class TextParser { public static String supports(final MultiProtocolURI url, final String mimeType) { try { // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok. - final List idioms = parsers(url, mimeType); - return (idioms == null || idioms.isEmpty() || (idioms.size() == 1 && idioms.get(0).getName().equals(genericIdiom.getName()))) ? "no parser found" : null; + final Set idioms = parsers(url, mimeType); + return (idioms == null || idioms.isEmpty() || (idioms.size() == 1 && idioms.iterator().next().getName().equals(genericIdiom.getName()))) ? "no parser found" : null; } catch (final Parser.Failure e) { // in case that a parser is not available, return a error string describing the problem. return e.getMessage(); @@ -355,17 +359,17 @@ public final class TextParser { * @return a list of Idiom parsers that may be appropriate for the given criteria * @throws Parser.Failure */ - private static List parsers(final MultiProtocolURI url, String mimeType1) throws Parser.Failure { - final List idioms = new ArrayList(2); + private static Set parsers(final MultiProtocolURI url, String mimeType1) throws Parser.Failure { + final Set idioms = new HashSet(2); // check extension String ext = url.getFileExtension(); - Parser idiom; + Set idiom; if (ext != null && ext.length() > 0) { ext = ext.toLowerCase(); if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url); idiom = ext2parser.get(ext); - if (idiom != null) idioms.add(idiom); + if (idiom != null) idioms.addAll(idiom); } // check given mime type @@ -373,12 +377,12 @@ public final class TextParser { mimeType1 = normalizeMimeType(mimeType1); if (denyMime.containsKey(mimeType1)) throw new Parser.Failure("mime type '" + mimeType1 + "' is denied (1)", url); idiom = mime2parser.get(mimeType1); - if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom); + if (idiom != null && !idioms.contains(idiom)) idioms.addAll(idiom); } // check mime type computed from extension final String mimeType2 = ext2mime.get(ext); - if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.add(idiom); + if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.addAll(idiom); // always add the generic parser idioms.add(genericIdiom); @@ -412,9 +416,9 @@ public final class TextParser { if (denyExtensionx.containsKey(ext)) return "file extension '" + ext + "' is denied (2)"; final String mimeType = ext2mime.get(ext); if (mimeType == null) return "no parser available"; - final Parser idiom = mime2parser.get(mimeType); + final Set idiom = mime2parser.get(mimeType); assert idiom != null; - if (idiom == null) return "no parser available (internal error!)"; + if (idiom == null || idiom.size() == 0) return "no parser available (internal error!)"; return null; } diff --git a/source/net/yacy/document/parser/augment/AugmentParser.java b/source/net/yacy/document/parser/augment/AugmentParser.java index 55e96858e..1eb8a5361 100644 --- a/source/net/yacy/document/parser/augment/AugmentParser.java +++ b/source/net/yacy/document/parser/augment/AugmentParser.java @@ -6,45 +6,24 @@ import java.util.HashSet; import java.util.Iterator; import java.util.Set; +import net.yacy.document.AbstractParser; import net.yacy.document.Document; +import net.yacy.document.Parser; import net.yacy.document.parser.rdfa.impl.RDFaParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import de.anomic.data.ymark.YMarkUtil; -public class AugmentParser extends RDFaParser { +public class AugmentParser extends AbstractParser implements Parser { - public AugmentParser(String name) { - super(name); + RDFaParser rdfaParser; - System.out.println("augmented parser was initialized"); + public AugmentParser() { + super("AugmentParser"); + this.rdfaParser = new RDFaParser(); - this.SUPPORTED_EXTENSIONS.remove("htm"); - this.SUPPORTED_EXTENSIONS.remove("html"); - this.SUPPORTED_EXTENSIONS.remove("shtml"); - this.SUPPORTED_EXTENSIONS.remove("xhtml"); - this.SUPPORTED_EXTENSIONS.remove("php"); - this.SUPPORTED_EXTENSIONS.remove("php3"); - this.SUPPORTED_EXTENSIONS.remove("php4"); - this.SUPPORTED_EXTENSIONS.remove("php5"); - this.SUPPORTED_EXTENSIONS.remove("cfm"); - this.SUPPORTED_EXTENSIONS.remove("asp"); - this.SUPPORTED_EXTENSIONS.remove("aspx"); - this.SUPPORTED_EXTENSIONS.remove("tex"); - this.SUPPORTED_EXTENSIONS.remove("txt"); - this.SUPPORTED_EXTENSIONS.remove("jsp"); - this.SUPPORTED_EXTENSIONS.remove("mf"); - this.SUPPORTED_EXTENSIONS.remove("pl"); - this.SUPPORTED_EXTENSIONS.remove("py"); - this.SUPPORTED_MIME_TYPES.remove("text/html"); - this.SUPPORTED_MIME_TYPES.remove("text/xhtml+xml"); - this.SUPPORTED_MIME_TYPES.remove("application/xhtml+xml"); - this.SUPPORTED_MIME_TYPES.remove("application/x-httpd-php"); - this.SUPPORTED_MIME_TYPES.remove("application/x-tex"); - this.SUPPORTED_MIME_TYPES.remove("text/plain"); - this.SUPPORTED_MIME_TYPES.remove("text/sgml"); - this.SUPPORTED_MIME_TYPES.remove("text/csv"); + System.out.println("augmented parser was initialized"); this.SUPPORTED_EXTENSIONS.add("html"); this.SUPPORTED_EXTENSIONS.add("php"); @@ -59,7 +38,7 @@ public class AugmentParser extends RDFaParser { String charset, InputStream source) throws Failure, InterruptedException { - Document[] htmlDocs = super.parse(url, mimeType, charset, source); + Document[] htmlDocs = this.rdfaParser.parse(url, mimeType, charset, source); try { source.reset(); } catch (IOException e) { diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 77918959e..0d5f7b9ae 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -53,8 +53,8 @@ public class htmlParser extends AbstractParser implements Parser { private static final Pattern patternUnderline = Pattern.compile("_"); - public htmlParser(String name) { - super(name); + public htmlParser() { + super("Streaming HTML Parser"); this.SUPPORTED_EXTENSIONS.add("htm"); this.SUPPORTED_EXTENSIONS.add("html"); this.SUPPORTED_EXTENSIONS.add("phtml"); @@ -299,7 +299,7 @@ public class htmlParser extends AbstractParser implements Parser { try { url = new DigestURI(args[0]); final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000); - final Document[] document = new htmlParser("HTML Parser").parse(url, "text/html", null, new ByteArrayInputStream(content)); + final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content)); final String title = document[0].dc_title(); System.out.println(title); System.out.println(CharacterCoding.unicode2html(title, false)); diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java index 6482ca59a..0b3b9c09c 100644 --- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java +++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java @@ -3,13 +3,21 @@ */ package net.yacy.document.parser.rdfa.impl; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.Reader; +import java.net.MalformedURLException; +import java.net.URL; import java.util.HashSet; import java.util.Set; +import net.yacy.document.AbstractParser; import net.yacy.document.Document; +import net.yacy.document.Parser; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.rdfa.IRDFaTriple; import net.yacy.kelondro.data.meta.DigestURI; @@ -19,35 +27,13 @@ import net.yacy.kelondro.logging.Log; * @author fgandon * */ -public class RDFaParser extends htmlParser { - - public RDFaParser(String name) { - super(name); - this.SUPPORTED_EXTENSIONS.remove("htm"); - this.SUPPORTED_EXTENSIONS.remove("html"); - this.SUPPORTED_EXTENSIONS.remove("shtml"); - this.SUPPORTED_EXTENSIONS.remove("xhtml"); - this.SUPPORTED_EXTENSIONS.remove("php"); - this.SUPPORTED_EXTENSIONS.remove("php3"); - this.SUPPORTED_EXTENSIONS.remove("php4"); - this.SUPPORTED_EXTENSIONS.remove("php5"); - this.SUPPORTED_EXTENSIONS.remove("cfm"); - this.SUPPORTED_EXTENSIONS.remove("asp"); - this.SUPPORTED_EXTENSIONS.remove("aspx"); - this.SUPPORTED_EXTENSIONS.remove("tex"); - this.SUPPORTED_EXTENSIONS.remove("txt"); - this.SUPPORTED_EXTENSIONS.remove("jsp"); - this.SUPPORTED_EXTENSIONS.remove("mf"); - this.SUPPORTED_EXTENSIONS.remove("pl"); - this.SUPPORTED_EXTENSIONS.remove("py"); - this.SUPPORTED_MIME_TYPES.remove("text/html"); - this.SUPPORTED_MIME_TYPES.remove("text/xhtml+xml"); - this.SUPPORTED_MIME_TYPES.remove("application/xhtml+xml"); - this.SUPPORTED_MIME_TYPES.remove("application/x-httpd-php"); - this.SUPPORTED_MIME_TYPES.remove("application/x-tex"); - this.SUPPORTED_MIME_TYPES.remove("text/plain"); - this.SUPPORTED_MIME_TYPES.remove("text/sgml"); - this.SUPPORTED_MIME_TYPES.remove("text/csv"); +public class RDFaParser extends AbstractParser implements Parser { + + private final htmlParser hp; + + public RDFaParser() { + super("RDFa Parser"); + this.hp = new htmlParser(); this.SUPPORTED_EXTENSIONS.add("html"); this.SUPPORTED_EXTENSIONS.add("php"); @@ -58,7 +44,7 @@ public class RDFaParser extends htmlParser { } @Override - public Document[] parse(DigestURI url, String mimeType, + public Document[] parse(DigestURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { @@ -116,7 +102,7 @@ public class RDFaParser extends htmlParser { Document[] htmlDocs = null; try { - htmlDocs = super.parse(url, mimeType, charset, source); + htmlDocs = this.hp.parse(url, mimeType, charset, source); source.reset(); } catch (IOException e1) { @@ -129,9 +115,9 @@ public class RDFaParser extends htmlParser { private Document convertAllTriplesToDocument(DigestURI url, String mimeType, String charset, IRDFaTriple[] allTriples) { - Set languages = new HashSet(2); + //Set languages = new HashSet(2); Set keywords = new HashSet(allTriples.length); - Set sections = new HashSet(5); + //Set sections = new HashSet(5); String all = ""; for (IRDFaTriple irdFaTriple : allTriples) { @@ -166,4 +152,51 @@ public class RDFaParser extends htmlParser { } } + public static void main(String[] args) { + URL aURL = null; + if (args.length < 1) { + System.out + .println("Usage: one and only one argument giving a file path or a URL."); + } else { + File aFile = new File(args[0]); + Reader aReader = null; + if (aFile.exists()) { + try { + aReader = new FileReader(aFile); + } catch (FileNotFoundException e) { + aReader = null; + } + } else { + try { + aURL = new URL(args[0]); + aReader = new InputStreamReader(aURL.openStream()); + } catch (MalformedURLException e) { + } catch (IOException e) { + e.printStackTrace(); + aReader = null; + } + + } + + if (aReader != null) { + RDFaParser aParser = new RDFaParser(); + try { + aParser.parse(new DigestURI(args[0]),"","",aURL.openStream()); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (Failure e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } else + System.out.println("File or URL not recognized."); + + } + + } } diff --git a/source/net/yacy/document/parser/rdfa/main.java b/source/net/yacy/document/parser/rdfa/main.java deleted file mode 100644 index 50c8d2b19..000000000 --- a/source/net/yacy/document/parser/rdfa/main.java +++ /dev/null @@ -1,67 +0,0 @@ -package net.yacy.document.parser.rdfa; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.Reader; -import java.net.MalformedURLException; -import java.net.URL; - -import net.yacy.document.Parser.Failure; -import net.yacy.document.parser.rdfa.impl.RDFaParser; -import net.yacy.kelondro.data.meta.DigestURI; - -public class main { - /** - * @param args - */ - public static void main(String[] args) { - URL aURL = null; - if (args.length < 1) { - System.out - .println("Usage: one and only one argument giving a file path or a URL."); - } else { - File aFile = new File(args[0]); - Reader aReader = null; - if (aFile.exists()) { - try { - aReader = new FileReader(aFile); - } catch (FileNotFoundException e) { - aReader = null; - } - } else { - try { - aURL = new URL(args[0]); - aReader = new InputStreamReader(aURL.openStream()); - } catch (MalformedURLException e) { - } catch (IOException e) { - e.printStackTrace(); - aReader = null; - } - - } - - if (aReader != null) { - RDFaParser aParser = new RDFaParser("html"); - try { - aParser.parse(new DigestURI(args[0]),"","",aURL.openStream()); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } catch (Failure e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (InterruptedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } else - System.out.println("File or URL not recognized."); - - } - - } -}