added concurrency enhancement to xml parser

pull/1/head
Michael Peter Christen 13 years ago
parent cf79b6cee3
commit 964406ad17

@ -57,14 +57,27 @@ public class RSSReader extends DefaultHandler {
this.parsingItem = false;
this.type = Type.none;
}
/** Holds at most one SAX parser per thread; filled lazily by {@link #getParser()}. */
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

/**
 * Returns the SAX parser bound to the calling thread, creating and caching it on first use.
 * The same instance is handed out again on every later call from that thread.
 *
 * @return the calling thread's parser, never {@code null}
 * @throws SAXException if the parser cannot be created; a
 *         {@link ParserConfigurationException} is wrapped with its message and kept as cause
 */
private static SAXParser getParser() throws SAXException {
    final SAXParser cached = tlSax.get();
    if (cached != null) {
        return cached;
    }
    final SAXParser created;
    try {
        created = SAXParserFactory.newInstance().newSAXParser();
    } catch (ParserConfigurationException e) {
        throw new SAXException(e.getMessage(), e);
    }
    // remember the new parser for this thread before handing it out
    tlSax.set(created);
    return created;
}
public RSSReader(final int maxsize, InputStream stream, final Type type) throws IOException {
this(maxsize);
this.type = type;
if (!(stream instanceof ByteArrayInputStream) && !(stream instanceof BufferedInputStream)) stream = new BufferedInputStream(stream);
final SAXParserFactory factory = SAXParserFactory.newInstance();
try {
final SAXParser saxParser = factory.newSAXParser();
final SAXParser saxParser = getParser();
// do not look at external dtd - see: http://www.ibm.com/developerworks/xml/library/x-tipcfsx/index.html
saxParser.getXMLReader().setEntityResolver(new EntityResolver() {
@Override
@ -76,8 +89,6 @@ public class RSSReader extends DefaultHandler {
saxParser.parse(stream, this);
} catch (final SAXException e) {
throw new IOException (e.getMessage());
} catch (final ParserConfigurationException e) {
throw new IOException (e.getMessage());
}
}

@ -0,0 +1,240 @@
/**
* Vocabulary
* Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 07.01.2012 on http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.lod;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.storage.Files;
import net.yacy.document.WordCache.Dictionary;
import net.yacy.document.geolocalization.Localization;
/**
 * An in-memory vocabulary that maps normalized synonyms to the term they stand for.
 * <p>
 * Three lookup tables are maintained:
 * <ul>
 * <li>{@code synonym2term}: normalized synonym to the (print) term,</li>
 * <li>{@code term2synonym}: term to the synonym that was registered last for it,</li>
 * <li>{@code synonym2synonyms}: every member of a synonym group to the whole group
 *     (only filled when loading from a property file).</li>
 * </ul>
 */
public class SimpleVocabulary {

    private final String navigatorName;
    private final Map<String, String> synonym2term;
    private final Map<String, String> term2synonym;
    private final Map<String, Set<String>> synonym2synonyms;

    /**
     * Creates an empty vocabulary.
     *
     * @param name the vocabulary name, returned by {@link #getName()} and used in metatags
     */
    public SimpleVocabulary(String name) {
        this.navigatorName = name;
        this.synonym2term = new ConcurrentHashMap<String, String>();
        this.term2synonym = new ConcurrentHashMap<String, String>();
        this.synonym2synonyms = new ConcurrentHashMap<String, Set<String>>();
    }

    /**
     * Creates a vocabulary from a property file. Each line is either a single term or
     * {@code term<sep>synonym,synonym,...} where the separator is the first {@code ':'},
     * else the first {@code '='}, else the first tab. Everything after a {@code '#'} is
     * ignored, as are empty lines.
     * <p>
     * If the calling thread is interrupted while waiting for lines, loading stops, the
     * vocabulary keeps what was read so far and the thread's interrupt flag is restored.
     *
     * @param name the vocabulary name
     * @param propFile the file to read the definitions from
     * @throws IOException declared for the file access done by the line reader
     */
    public SimpleVocabulary(String name, File propFile) throws IOException {
        this(name);
        BlockingQueue<String> list = Files.concurentLineReader(propFile, 1000);
        String line;
        try {
            // identity comparison is intended: POISON_LINE is used as an end-of-input marker object
            while ((line = list.take()) != Files.POISON_LINE) {
                addLine(line);
            }
        } catch (InterruptedException e) {
            // keep the partially loaded vocabulary (best effort) but do not hide the
            // interruption from the code that runs after this constructor
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Creates a vocabulary from location names; each name is registered under its
     * lower-case form.
     *
     * @param name the vocabulary name
     * @param localization source of the location names
     */
    public SimpleVocabulary(String name, Localization localization) {
        this(name);
        Set<String> locNames = localization.locationNames();
        for (String loc: locNames) {
            this.synonym2term.put(loc.toLowerCase(), loc);
            this.term2synonym.put(loc, loc.toLowerCase());
        }
    }

    /**
     * Creates a vocabulary from the words of a dictionary; each word is registered under
     * its lower-case form.
     *
     * @param name the vocabulary name
     * @param dictionary source of the words
     */
    public SimpleVocabulary(String name, Dictionary dictionary) {
        this(name);
        Set<StringBuilder> words = dictionary.getWords();
        String s;
        for (StringBuilder word: words) {
            s = word.toString();
            this.synonym2term.put(s.toLowerCase(), s);
            this.term2synonym.put(s, s.toLowerCase());
        }
    }

    /** Parses one line of a vocabulary property file and registers its term and synonyms. */
    private void addLine(String rawLine) {
        String line = rawLine.trim();
        int p = line.indexOf('#');
        if (p >= 0) {
            // cut off the comment part
            line = line.substring(0, p).trim();
        }
        if (line.length() == 0) {
            return;
        }

        // separator precedence: ':' before '=' before tab
        p = line.indexOf(':');
        if (p < 0) {
            p = line.indexOf('=');
        }
        if (p < 0) {
            p = line.indexOf('\t');
        }

        if (p < 0) {
            // no separator: the line is a single term without explicit synonyms
            String singleTerm = normalizeKey(line);
            String singleWord = normalizeWord(line);
            this.synonym2term.put(singleWord, singleTerm);
            this.term2synonym.put(singleTerm, singleWord);
            return;
        }

        String term = normalizeKey(line.substring(0, p));
        String[] tags = line.substring(p + 1).split(",");
        Set<String> synonyms = new HashSet<String>();
        synonyms.add(term);
        for (String tag: tags) {
            if (tag.length() == 0) {
                continue;
            }
            // the group holds the raw spelling as well as the normalized one
            synonyms.add(tag);
            String normalized = normalizeWord(tag);
            if (normalized.length() == 0) {
                continue;
            }
            synonyms.add(normalized);
            this.synonym2term.put(normalized, term);
            this.term2synonym.put(term, normalized);
        }

        // the term is always reachable through its own normalized form; this is also the
        // entry that finally stays in term2synonym for the term
        String normalizedTerm = normalizeWord(term);
        this.synonym2term.put(normalizedTerm, term);
        this.term2synonym.put(term, normalizedTerm);
        synonyms.add(normalizedTerm);

        // every member of the group points to the same (shared) set instance
        for (String s: synonyms) {
            this.synonym2synonyms.put(s, synonyms);
        }
    }

    /**
     * Turns a raw term into a key: trims it and replaces {@code '+'} and {@code '/'}
     * (with or without a leading space) by commas.
     */
    private String normalizeKey(String k) {
        k = k.trim();
        k = k.replaceAll(" \\+", ", "); // remove symbols that are bad in a query attribute
        k = k.replaceAll(" /", ", ");
        k = k.replaceAll("\\+", ",");
        k = k.replaceAll("/", ",");
        // NOTE(review): as written this replaces one space by one space and has no effect;
        // possibly meant to collapse double spaces - confirm against the repository copy
        k = k.replaceAll(" ", " ");
        return k;
    }

    /**
     * @return the name this vocabulary was created with
     */
    public String getName() {
        return this.navigatorName;
    }

    /**
     * Looks up a word and wraps the matching term into a metatag.
     *
     * @param prefix the prefix character of the metatag
     * @param word the synonym to look up; it is used as given, without normalization
     * @return a metatag for the term of the word, or {@code null} if the word is unknown
     */
    public Metatag getMetatag(char prefix, final String word) {
        String printname = this.synonym2term.get(word);
        if (printname == null) {
            return null;
        }
        return new Metatag(prefix, this.navigatorName, printname);
    }

    /**
     * @param term any member of a synonym group
     * @return the group the term belongs to (the internally stored set, not a copy),
     *         or {@code null} if no group is known for the term
     */
    public Set<String> getSynonyms(String term) {
        return this.synonym2synonyms.get(term);
    }

    /**
     * @return the key set of the synonym table, i.e. all registered synonyms
     */
    public Set<String> tags() {
        return this.synonym2term.keySet();
    }

    @Override
    public String toString() {
        return this.term2synonym.toString();
    }

    private final static Pattern PATTERN_AE = Pattern.compile("\u00E4"); // german umlaute hack for better matching
    private final static Pattern PATTERN_OE = Pattern.compile("\u00F6");
    private final static Pattern PATTERN_UE = Pattern.compile("\u00FC");
    private final static Pattern PATTERN_SZ = Pattern.compile("\u00DF");

    /**
     * Normalizes a word for matching: trims it, lower-cases it and transliterates the
     * German characters ae/oe/ue umlauts and sharp s to {@code ae}, {@code oe},
     * {@code ue} and {@code ss}.
     *
     * @param word the word to normalize
     * @return the normalized word
     */
    public static final String normalizeWord(String word) {
        word = word.trim().toLowerCase();
        word = PATTERN_AE.matcher(word).replaceAll("ae");
        word = PATTERN_OE.matcher(word).replaceAll("oe");
        word = PATTERN_UE.matcher(word).replaceAll("ue");
        word = PATTERN_SZ.matcher(word).replaceAll("ss");
        return word;
    }

    /**
     * A tag of the form {@code <prefix><vocabulary name>:<print name>} where spaces of
     * the print name are written as underscores.
     */
    public static class Metatag {

        private final String vocName;
        private final String print;
        private final char prefix;

        /**
         * @param prefix the prefix character
         * @param vocName the vocabulary name
         * @param print the print name (with spaces, not underscores)
         */
        public Metatag(char prefix, String vocName, String print) {
            this.prefix = prefix;
            this.vocName = vocName;
            this.print = print;
        }

        /**
         * Parses the string form as produced by {@link #toString()}.
         *
         * @param prefix the prefix character the metatag is expected to start with
         * @param metatag the string form, {@code <prefix><vocabulary name>:<print name>}
         * @throws RuntimeException if the metatag has no {@code ':'} after its first character
         */
        public Metatag(char prefix, String metatag) throws RuntimeException {
            this.prefix = prefix;
            assert metatag.charAt(0) == prefix;
            int p = metatag.indexOf(':');
            // p == 0 would make substring(1, p) fail with an index error, report it as bad metatag instead
            if (p < 1) {
                throw new RuntimeException("bad metatag: metatag = " + metatag);
            }
            this.vocName = metatag.substring(1, p);
            this.print = decodeMaskname(metatag.substring(p + 1));
        }

        /**
         * @return the vocabulary name of this tag
         */
        public String getVocabularyName() {
            return this.vocName;
        }

        /**
         * @return the print name of this tag
         */
        public String getPrintName() {
            return this.print;
        }

        @Override
        public String toString() {
            return this.prefix + this.vocName + ":" + encodePrintname(this.print);
        }

        /**
         * Two metatags are equal if vocabulary name and print name are equal; the prefix
         * is not compared. Returns {@code false} for {@code null} and foreign types.
         */
        @Override
        public boolean equals(Object m) {
            if (this == m) {
                return true;
            }
            if (!(m instanceof Metatag)) {
                return false;
            }
            Metatag m0 = (Metatag) m;
            return this.vocName.equals(m0.vocName) && this.print.equals(m0.print);
        }

        @Override
        public int hashCode() {
            return this.vocName.hashCode() + this.print.hashCode();
        }

        private final static Pattern PATTERN_UL = Pattern.compile("_");
        private final static Pattern PATTERN_SP = Pattern.compile(" ");

        /**
         * @param printname a print name
         * @return the print name with every space replaced by an underscore
         */
        public static final String encodePrintname(String printname) {
            return PATTERN_SP.matcher(printname).replaceAll("_");
        }

        /**
         * @param maskname an encoded print name
         * @return the name with every underscore replaced by a space
         */
        public static final String decodeMaskname(String maskname) {
            return PATTERN_UL.matcher(maskname).replaceAll(" ");
        }

        /**
         * Removes all tags that start with the given prefix from a space-separated tag string.
         *
         * @param prefix the prefix character that marks a tag for removal
         * @param tagString space-separated tags, may be {@code null} or empty
         * @return the remaining tags joined by single spaces, or an empty string if none remain
         */
        public static String cleanTagFromAutotagging(char prefix, final String tagString) {
            if (tagString == null || tagString.length() == 0) {
                return "";
            }
            String[] tags = PATTERN_SP.split(tagString);
            StringBuilder sb = new StringBuilder(tagString.length());
            for (String tag : tags) {
                if (tag.length() > 0 && tag.charAt(0) != prefix) {
                    sb.append(tag).append(' ');
                }
            }
            if (sb.length() == 0) {
                return "";
            }
            // drop the trailing separator
            return sb.substring(0, sb.length() - 1);
        }
    }
}

@ -72,6 +72,20 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
private SAXParser saxParser;
private final InputSource inputSource;
private final InputStream inputStream;
/** Holds at most one SAX parser per thread; filled lazily by {@link #getParser()}. */
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

/**
 * Returns the SAX parser bound to the calling thread, creating and caching it on first use.
 * The same instance is handed out again on every later call from that thread.
 *
 * @return the calling thread's parser, never {@code null}
 * @throws SAXException if the parser cannot be created; a
 *         {@link ParserConfigurationException} is wrapped with its message and kept as cause
 */
private static SAXParser getParser() throws SAXException {
    final SAXParser cached = tlSax.get();
    if (cached != null) {
        return cached;
    }
    final SAXParser created;
    try {
        created = SAXParserFactory.newInstance().newSAXParser();
    } catch (ParserConfigurationException e) {
        throw new SAXException(e.getMessage(), e);
    }
    // remember the new parser for this thread before handing it out
    tlSax.set(created);
    return created;
}
public SurrogateReader(final InputStream stream, int queueSize) throws IOException {
this.buffer = new StringBuilder(300);
@ -85,12 +99,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
this.inputSource.setEncoding("UTF-8");
this.inputStream = stream;
final SAXParserFactory factory = SAXParserFactory.newInstance();
try {
this.saxParser = factory.newSAXParser();
} catch (ParserConfigurationException e) {
Log.logException(e);
throw new IOException(e.getMessage());
this.saxParser = getParser();
} catch (SAXException e) {
Log.logException(e);
throw new IOException(e.getMessage());

@ -102,6 +102,20 @@ public class OAIListFriendsLoader {
return map;
}
/** Holds at most one SAX parser per thread; filled lazily by {@link #getParser()}. */
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

/**
 * Returns the SAX parser bound to the calling thread, creating and caching it on first use.
 * The same instance is handed out again on every later call from that thread.
 *
 * @return the calling thread's parser, never {@code null}
 * @throws SAXException if the parser cannot be created; a
 *         {@link ParserConfigurationException} is wrapped with its message and kept as cause
 */
private static SAXParser getParser() throws SAXException {
    final SAXParser cached = tlSax.get();
    if (cached != null) {
        return cached;
    }
    final SAXParser created;
    try {
        created = SAXParserFactory.newInstance().newSAXParser();
    } catch (ParserConfigurationException e) {
        throw new SAXException(e.getMessage(), e);
    }
    // remember the new parser for this thread before handing it out
    tlSax.set(created);
    return created;
}
// get a resumption token using a SAX xml parser from an input stream
private static class Parser extends DefaultHandler {
@ -120,10 +134,9 @@ public class OAIListFriendsLoader {
this.buffer = new StringBuilder();
this.parsingValue = false;
this.atts = null;
final SAXParserFactory factory = SAXParserFactory.newInstance();
this.stream = new ByteArrayInputStream(b);
try {
this.saxParser = factory.newSAXParser();
this.saxParser = getParser();
this.saxParser.parse(this.stream, this);
} catch (final SAXException e) {
Log.logException(e);
@ -131,10 +144,6 @@ public class OAIListFriendsLoader {
} catch (final IOException e) {
Log.logException(e);
Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + UTF8.String(b));
} catch (final ParserConfigurationException e) {
Log.logException(e);
Log.logWarning("OAIListFriendsLoader.Parser", "OAIListFriends was not parsed:\n" + UTF8.String(b));
throw new IOException(e.getMessage());
} finally {
try {
this.stream.close();

@ -246,6 +246,20 @@ public class ResumptionToken extends TreeMap<String, String> {
", cursor=" + getCursor() + ", token=" + getToken();
}
/** Holds at most one SAX parser per thread; filled lazily by {@link #getParser()}. */
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

/**
 * Returns the SAX parser bound to the calling thread, creating and caching it on first use.
 * The same instance is handed out again on every later call from that thread.
 *
 * @return the calling thread's parser, never {@code null}
 * @throws SAXException if the parser cannot be created; a
 *         {@link ParserConfigurationException} is wrapped with its message and kept as cause
 */
private static SAXParser getParser() throws SAXException {
    final SAXParser cached = tlSax.get();
    if (cached != null) {
        return cached;
    }
    final SAXParser created;
    try {
        created = SAXParserFactory.newInstance().newSAXParser();
    } catch (ParserConfigurationException e) {
        throw new SAXException(e.getMessage(), e);
    }
    // remember the new parser for this thread before handing it out
    tlSax.set(created);
    return created;
}
// get a resumption token using a SAX xml parser from an input stream
private class Parser extends DefaultHandler {
@ -255,15 +269,14 @@ public class ResumptionToken extends TreeMap<String, String> {
private SAXParser saxParser;
private final InputStream stream;
private Attributes atts;
public Parser(final byte[] b) throws IOException {
this.buffer = new StringBuilder();
this.parsingValue = false;
this.atts = null;
final SAXParserFactory factory = SAXParserFactory.newInstance();
this.stream = new ByteArrayInputStream(b);
try {
this.saxParser = factory.newSAXParser();
this.saxParser = getParser();
this.saxParser.parse(this.stream, this);
} catch (final SAXException e) {
Log.logException(e);
@ -271,9 +284,6 @@ public class ResumptionToken extends TreeMap<String, String> {
} catch (final IOException e) {
Log.logException(e);
Log.logWarning("ResumptionToken", "token was not parsed (2):\n" + UTF8.String(b));
} catch (final ParserConfigurationException e) {
Log.logException(e);
Log.logWarning("ResumptionToken", "token was not parsed (3):\n" + UTF8.String(b));
throw new IOException(e.getMessage());
} finally {
try {

@ -51,6 +51,20 @@ public class mmParser extends AbstractParser implements Parser {
SUPPORTED_MIME_TYPES.add("application/freemind");
SUPPORTED_MIME_TYPES.add("application/x-freemind");
}
/** Holds at most one SAX parser per thread; filled lazily by {@link #getParser()}. */
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

/**
 * Returns the SAX parser bound to the calling thread, creating and caching it on first use.
 * The same instance is handed out again on every later call from that thread.
 *
 * @return the calling thread's parser, never {@code null}
 * @throws SAXException if the parser cannot be created; a
 *         {@link ParserConfigurationException} is wrapped with its message and kept as cause
 */
private static SAXParser getParser() throws SAXException {
    final SAXParser cached = tlSax.get();
    if (cached != null) {
        return cached;
    }
    final SAXParser created;
    try {
        created = SAXParserFactory.newInstance().newSAXParser();
    } catch (ParserConfigurationException e) {
        throw new SAXException(e.getMessage(), e);
    }
    // remember the new parser for this thread before handing it out
    tlSax.set(created);
    return created;
}
public Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source)
@ -61,7 +75,7 @@ public class mmParser extends AbstractParser implements Parser {
byte[] content = new byte[0];
try {
final SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
final SAXParser saxParser = getParser();
final FreeMindHandler freeMindHandler = new FreeMindHandler();
saxParser.parse(source, freeMindHandler);
@ -76,8 +90,6 @@ public class mmParser extends AbstractParser implements Parser {
content = UTF8.getBytes(sb.toString());
} catch (ParserConfigurationException ex) {
log.logWarning(ex.getMessage());
} catch (SAXException ex) {
log.logWarning(ex.getMessage());
} catch (IOException ex) {

@ -35,9 +35,12 @@ import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.SAXException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
@ -87,6 +90,20 @@ public class odtParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/OOo-writer");
}
/** Holds at most one SAX parser per thread; filled lazily by {@link #getParser()}. */
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

/**
 * Returns the SAX parser bound to the calling thread, creating and caching it on first use.
 * The same instance is handed out again on every later call from that thread.
 *
 * @return the calling thread's parser, never {@code null}
 * @throws SAXException if the parser cannot be created; a
 *         {@link ParserConfigurationException} is wrapped with its message and kept as cause
 */
private static SAXParser getParser() throws SAXException {
    final SAXParser cached = tlSax.get();
    if (cached != null) {
        return cached;
    }
    final SAXParser created;
    try {
        created = SAXParserFactory.newInstance().newSAXParser();
    } catch (ParserConfigurationException e) {
        throw new SAXException(e.getMessage(), e);
    }
    // remember the new parser for this thread before handing it out
    tlSax.set(created);
    return created;
}
private Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final File dest)
throws Parser.Failure, InterruptedException {
@ -103,8 +120,7 @@ public class odtParser extends AbstractParser implements Parser {
// opening the file as zip file
final ZipFile zipFile = new ZipFile(dest);
final Enumeration<? extends ZipEntry> zipEnum = zipFile.entries();
final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
// looping through all containing files
while (zipEnum.hasMoreElements()) {
@ -121,7 +137,7 @@ public class odtParser extends AbstractParser implements Parser {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
final SAXParser saxParser = saxParserFactory.newSAXParser();
final SAXParser saxParser = getParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
} finally {
// close readers and writers
@ -133,7 +149,7 @@ public class odtParser extends AbstractParser implements Parser {
} else if (entryName.equals("meta.xml")) {
// meta.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
final SAXParser saxParser = saxParserFactory.newSAXParser();
final SAXParser saxParser = getParser();
final ODMetaHandler metaData = new ODMetaHandler();
saxParser.parse(zipFileEntryStream, metaData);
docDescription = metaData.getDescription();

@ -35,9 +35,12 @@ import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.SAXException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
@ -71,6 +74,20 @@ public class ooxmlParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.template");
}
/** Holds at most one SAX parser per thread; filled lazily by {@link #getParser()}. */
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

/**
 * Returns the SAX parser bound to the calling thread, creating and caching it on first use.
 * The same instance is handed out again on every later call from that thread.
 *
 * @return the calling thread's parser, never {@code null}
 * @throws SAXException if the parser cannot be created; a
 *         {@link ParserConfigurationException} is wrapped with its message and kept as cause
 */
private static SAXParser getParser() throws SAXException {
    final SAXParser cached = tlSax.get();
    if (cached != null) {
        return cached;
    }
    final SAXParser created;
    try {
        created = SAXParserFactory.newInstance().newSAXParser();
    } catch (ParserConfigurationException e) {
        throw new SAXException(e.getMessage(), e);
    }
    // remember the new parser for this thread before handing it out
    tlSax.set(created);
    return created;
}
private Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final File dest) throws Parser.Failure, InterruptedException {
CharBuffer writer = null;
@ -85,8 +102,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
// opening the file as zip file
final ZipFile zipFile= new ZipFile(dest);
final Enumeration<? extends ZipEntry> zipEnum = zipFile.entries();
final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
// looping through all containing files
while (zipEnum.hasMoreElements()) {
@ -105,7 +121,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
final SAXParser saxParser = saxParserFactory.newSAXParser();
final SAXParser saxParser = getParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
// close readers and writers
@ -118,7 +134,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
} else if (entryName.equals("docProps/core.xml")) {
// docProps/core.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
final SAXParser saxParser = saxParserFactory.newSAXParser();
final SAXParser saxParser = getParser();
final ODMetaHandler metaData = new ODMetaHandler();
saxParser.parse(zipFileEntryStream, metaData);
docDescription = metaData.getDescription();

@ -33,6 +33,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
@ -113,11 +114,24 @@ public class opensearchdescriptionReader extends DefaultHandler {
this.imageURL = null;
}
/** Holds at most one SAX parser per thread; filled lazily by {@link #getParser()}. */
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

/**
 * Returns the SAX parser bound to the calling thread, creating and caching it on first use.
 * The same instance is handed out again on every later call from that thread.
 *
 * @return the calling thread's parser, never {@code null}
 * @throws SAXException if the parser cannot be created; a
 *         {@link ParserConfigurationException} is wrapped with its message and kept as cause
 */
private static SAXParser getParser() throws SAXException {
    final SAXParser cached = tlSax.get();
    if (cached != null) {
        return cached;
    }
    final SAXParser created;
    try {
        created = SAXParserFactory.newInstance().newSAXParser();
    } catch (ParserConfigurationException e) {
        throw new SAXException(e.getMessage(), e);
    }
    // remember the new parser for this thread before handing it out
    tlSax.set(created);
    return created;
}
public opensearchdescriptionReader(final String path) {
this();
try {
final SAXParserFactory factory = SAXParserFactory.newInstance();
final SAXParser saxParser = factory.newSAXParser();
final SAXParser saxParser = getParser();
saxParser.parse(path, this);
} catch (final Exception e) {
Log.logException(e);
@ -127,8 +141,7 @@ public class opensearchdescriptionReader extends DefaultHandler {
public opensearchdescriptionReader(final InputStream stream) {
this();
try {
final SAXParserFactory factory = SAXParserFactory.newInstance();
final SAXParser saxParser = factory.newSAXParser();
final SAXParser saxParser = getParser();
saxParser.parse(stream, this);
} catch (final Exception e) {
Log.logException(e);

Loading…
Cancel
Save