refactoring of the ParserDispatcher and ParserConfig: resulted in the Idiom, Parser and Classification classes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6188 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 8ca1f5d400
commit 21b8704fb4

@ -37,7 +37,7 @@ import java.util.StringTokenizer;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.Classification;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpd;
@ -469,7 +469,7 @@ public class SettingsAck_p {
int enabledMimesCount = 0;
final StringBuilder currEnabledMimesTxt = new StringBuilder();
final String[] enabledMimes = ParserDispatcher.setEnabledParserList(newConfig);
final String[] enabledMimes = Classification.setEnabledParserList(newConfig);
Arrays.sort(enabledMimes);
currEnabledMimesTxt.setLength(0);

@ -28,8 +28,9 @@ import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserDispatcher;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
@ -218,9 +219,9 @@ public final class Settings_p {
*/
int parserIdx = 0;
final Iterator<Parser> availableParserIter = ParserDispatcher.availableParserList.values().iterator();
final Iterator<Idiom> availableParserIter = Parser.availableParserList.values().iterator();
while (availableParserIter.hasNext()) {
final Parser parserInfo = availableParserIter.next();
final Idiom parserInfo = availableParserIter.next();
prop.put("parser_" + parserIdx + "_name", parserInfo.getName());
int mimeIdx = 0;
@ -228,7 +229,7 @@ public final class Settings_p {
while (mimeTypeIter.hasMoreElements()) {
final String mimeType = mimeTypeIter.nextElement();
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_mimetype", mimeType);
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (ParserDispatcher.supportedMimeTypesContains(mimeType)) ? 1 : 0);
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (Classification.supportedMimeTypesContains(mimeType)) ? 1 : 0);
mimeIdx++;
}
prop.put("parser_" + parserIdx + "_mime", mimeIdx);

@ -32,7 +32,7 @@ import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.Classification;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
@ -218,14 +218,14 @@ public class FTPLoader {
private httpDocument getFile(final ftpc ftpClient, final CrawlEntry entry) throws Exception {
// determine the mimetype of the resource
final yacyURL entryUrl = entry.url();
final String extension = ParserDispatcher.getFileExt(entryUrl);
final String mimeType = ParserDispatcher.getMimeTypeByFileExt(extension);
final String extension = Classification.getFileExt(entryUrl);
final String mimeType = Classification.getMimeTypeByFileExt(extension);
final String path = getPath(entryUrl);
// if the mimetype and file extension is supported we start to download
// the file
httpDocument htCache = null;
if (ParserDispatcher.supportedContent(entryUrl, mimeType)) {
if (Classification.supportedContent(entryUrl, mimeType)) {
// aborting download if content is too long
final int size = ftpClient.fileSize(path);
if (size <= maxFileSize || maxFileSize == -1) {

@ -29,7 +29,7 @@ import java.io.IOException;
import java.util.Date;
import de.anomic.data.Blacklist;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.Classification;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;
import de.anomic.http.httpResponse;
@ -156,7 +156,7 @@ public final class HTTPLoader {
// request has been placed and result has been returned. work off response
//try {
if (ParserDispatcher.supportedContent(entry.url(), res.getResponseHeader().mime())) {
if (Classification.supportedContent(entry.url(), res.getResponseHeader().mime())) {
// get the content length and check if the length is allowed
long contentLength = res.getResponseHeader().getContentLength();

@ -38,12 +38,12 @@ import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
/**
* New classes implementing the {@link de.anomic.document.Parser} interface
* New classes implementing the {@link de.anomic.document.Idiom} interface
* can extend this class to inherit all functions already implemented in this class.
* @author Martin Thelian
* @version $LastChangedRevision$ / $LastChangedDate$
*/
public abstract class AbstractParser implements Parser {
public abstract class AbstractParser implements Idiom {
/**
* the logger class that should be used by the parser module for logging
@ -125,9 +125,9 @@ public abstract class AbstractParser implements Parser {
// XXX: workaround for relative paths within document
+ file.getPath().substring(file.getPath().indexOf(File.separatorChar) + 1)
+ "/" + file.getName());
final Document subdoc = ParserDispatcher.parseSource(
final Document subdoc = Parser.parseSource(
url,
ParserDispatcher.getMimeTypeByFileExt(files[i].substring(files[i].indexOf('.') + 1)),
Classification.getMimeTypeByFileExt(files[i].substring(files[i].indexOf('.') + 1)),
null, file);
// TODO: change anchors back to use '#' after archive name
doc.addSubDocument(subdoc);
@ -150,7 +150,7 @@ public abstract class AbstractParser implements Parser {
* and some additional metadata.
* @throws ParserException if the content could not be parsed properly
*
* @see de.anomic.document.Parser#parse(de.anomic.net.URL, java.lang.String, byte[])
* @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, byte[])
*/
public Document parse(
final yacyURL location,
@ -185,7 +185,7 @@ public abstract class AbstractParser implements Parser {
* and some additional metadata.
* @throws ParserException if the content could not be parsed properly
*
* @see de.anomic.document.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File)
* @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.File)
*/
public Document parse(
final yacyURL location,
@ -220,7 +220,7 @@ public abstract class AbstractParser implements Parser {
* and some additional metadata.
* @throws ParserException if the content could not be parsed properly
*
* @see de.anomic.document.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
* @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
*/
public abstract Document parse(yacyURL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;

@ -0,0 +1,288 @@
// Classification.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 09.07.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision: 5736 $
// $LastChangedBy: borg-0300 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
/**
 * Static registry that classifies resources by file extension and mime type.
 * <p>
 * The class keeps several extension/mime-type sets (html, media, image, audio,
 * video, application) plus the set of mime types for which a parser is enabled.
 * All state is held in static, unsynchronized collections; callers that mutate
 * the registries concurrently have to coordinate themselves.
 * <p>
 * All extensions and mime types are normalised with {@link Locale#ENGLISH}
 * so that the result does not depend on the default locale of the JVM.
 */
public class Classification {

    /** Mime type that is reported whenever nothing better is known. */
    private static final String DEFAULT_MIME_TYPE = "application/octet-stream";

    public static final HashSet<String> supportedHTMLFileExt = new HashSet<String>();
    public static final HashSet<String> supportedHTMLMimeTypes = new HashSet<String>();

    private static final HashSet<String> mediaExtSet = new HashSet<String>();
    private static final HashSet<String> imageExtSet = new HashSet<String>();
    private static final HashSet<String> audioExtSet = new HashSet<String>();
    private static final HashSet<String> videoExtSet = new HashSet<String>();
    private static final HashSet<String> appsExtSet = new HashSet<String>();

    /** file extension -> mime type, loaded from the file "httpd.mime" */
    private static final Properties mimeTypeLookupByFileExt = new Properties();

    public final static HashSet<String> enabledParserList = new HashSet<String>();
    private final static HashSet<String> supportedFileExt = new HashSet<String>();

    static {
        // load a list of extensions from file
        BufferedInputStream bufferedIn = null;
        try {
            mimeTypeLookupByFileExt.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime"))));
        } catch (final IOException e) {
            System.err.println("ERROR: httpd.mime not found in settings path");
        } finally {
            if (bufferedIn != null) {
                try {
                    bufferedIn.close();
                } catch (final IOException ignored) {
                    // best effort: the table has been read (or the read already failed
                    // and was reported above), a failing close adds no information
                }
            }
        }

        final String apps = "sit,hqx,img,dmg,exe,com,bat,sh,vbs,zip,jar";
        final String audio = "mp2,mp3,ogg,aac,aif,aiff,wav";
        final String video = "swf,avi,wmv,rm,mov,mpg,mpeg,ram,m4v";
        final String image = "jpg,jpeg,jpe,gif,png,ico,bmp";

        imageExtSet.addAll(extString2extList(image)); // image formats
        audioExtSet.addAll(extString2extList(audio)); // audio formats
        videoExtSet.addAll(extString2extList(video)); // video formats
        appsExtSet.addAll(extString2extList(apps));   // application formats
        initMediaExt(extString2extList(apps + "," +   // application container
                "tar,gz,bz2,arj,zip,rar," +            // archive formats
                "ps,xls,ppt,asf," +                    // text formats without support
                audio + "," +                          // audio formats
                video + "," +                          // video formats
                image                                  // image formats
        ));
    }

    /**
     * Trims a token and lower-cases it independently of the default locale.
     * (With e.g. a Turkish default locale "GIF".toLowerCase() would not be "gif".)
     */
    private static String normalize(final String token) {
        return token.trim().toLowerCase(Locale.ENGLISH);
    }

    /**
     * Splits a comma-separated list into its normalised (trimmed, lower-case) entries.
     *
     * @param extString comma-separated list, may be <code>null</code> or empty
     * @return a new, modifiable list with one entry per token; empty for a
     *         <code>null</code> or empty argument
     */
    public static List<String> extString2extList(final String extString) {
        final LinkedList<String> extensions = new LinkedList<String>();
        if ((extString == null) || (extString.length() == 0)) {
            return extensions;
        }
        for (final String token : extString.split(",")) {
            extensions.add(normalize(token));
        }
        return extensions;
    }

    /**
     * Adds the given extensions to the set of media extensions.
     *
     * @param mediaExtList extensions to add, expected to be normalised already
     */
    public static void initMediaExt(final List<String> mediaExtList) {
        mediaExtSet.addAll(mediaExtList);
    }

    /**
     * Checks whether an extension denotes a media file. Extensions that are
     * registered as html or as parseable file extension are never media.
     *
     * @param mediaExt file extension, may be <code>null</code>
     * @return <code>true</code> if the extension is a known media extension
     *         that is neither html nor parseable
     */
    public static boolean mediaExtContains(String mediaExt) {
        if (mediaExt == null) return false;
        mediaExt = normalize(mediaExt);
        if (supportedHTMLFileExt.contains(mediaExt)) return false;
        if (supportedFileExtContains(mediaExt)) return false;
        return mediaExtSet.contains(mediaExt);
    }

    /** @return <code>true</code> if the (non-null) extension is a known image extension */
    public static boolean imageExtContains(final String imageExt) {
        if (imageExt == null) return false;
        return imageExtSet.contains(normalize(imageExt));
    }

    /** @return <code>true</code> if the (non-null) extension is a known audio extension */
    public static boolean audioExtContains(final String audioExt) {
        if (audioExt == null) return false;
        return audioExtSet.contains(normalize(audioExt));
    }

    /** @return <code>true</code> if the (non-null) extension is a known video extension */
    public static boolean videoExtContains(final String videoExt) {
        if (videoExt == null) return false;
        return videoExtSet.contains(normalize(videoExt));
    }

    /** @return <code>true</code> if the (non-null) extension is a known application extension */
    public static boolean appsExtContains(final String appsExt) {
        if (appsExt == null) return false;
        return appsExtSet.contains(normalize(appsExt));
    }

    /**
     * Adds the mime types of a comma-separated list to the html mime types.
     *
     * @param htmlParsableMimeTypes comma-separated mime types; <code>null</code>
     *        or an empty string leaves the registry untouched
     */
    public static void initHTMLParsableMimeTypes(
            final String htmlParsableMimeTypes) {
        // extString2extList performs the same split/trim/lower-case that is needed here
        supportedHTMLMimeTypes.addAll(extString2extList(htmlParsableMimeTypes));
    }

    /**
     * Reduces a content type to its bare, lower-case mime type.
     * Parameters behind a ';' (e.g. a charset) are cut off <em>before</em>
     * trimming, so that "text/html ; charset=utf-8" yields "text/html"
     * and not "text/html " with a trailing blank.
     *
     * @param mimeType content type, may be <code>null</code>
     * @return the normalised mime type; "application/octet-stream" for <code>null</code>
     */
    public static String normalizeMimeType(String mimeType) {
        // if (mimeType == null) doMimeTypeAnalysis
        if (mimeType == null) return DEFAULT_MIME_TYPE;
        final int pos = mimeType.indexOf(';');
        if (pos >= 0) mimeType = mimeType.substring(0, pos);
        return normalize(mimeType);
    }

    /**
     * Looks up the mime type that belongs to a file extension.
     * The extension is tried as given first; if that fails the normalised
     * (trimmed, lower-case) form is tried, so that "PDF" resolves like "pdf".
     *
     * @param fileExt file extension without dot, may be <code>null</code>
     * @return the mime type from the lookup table or "application/octet-stream"
     *         if the extension is <code>null</code> or unknown
     */
    public static String getMimeTypeByFileExt(final String fileExt) {
        if (fileExt == null) return DEFAULT_MIME_TYPE; // Properties does not accept null keys
        final String exactMatch = mimeTypeLookupByFileExt.getProperty(fileExt);
        if (exactMatch != null) return exactMatch;
        return mimeTypeLookupByFileExt.getProperty(normalize(fileExt), DEFAULT_MIME_TYPE);
    }

    /**
     * Adds the given extensions to the html file extensions.
     *
     * @param supportedRealtimeFileExtList extensions to add, expected to be normalised already
     */
    public static void initSupportedHTMLFileExt(final List<String> supportedRealtimeFileExtList) {
        supportedHTMLFileExt.addAll(supportedRealtimeFileExtList);
    }

    /** @return <code>true</code> if the normalised mime type is registered as html mime type */
    static boolean HTMLParsableMimeTypesContains(String mimeType) {
        mimeType = normalizeMimeType(mimeType);
        return supportedHTMLMimeTypes.contains(mimeType);
    }

    /**
     * Decides whether a resource can be parsed.
     * For text/html, application/xhtml+xml and text/plain only the mime type
     * is checked; for everything else the file extension of the url must be
     * supported as well.
     *
     * @param url url of the resource; must not be <code>null</code> unless the
     *        mime type is one of the three types named above
     * @param mimeType content type of the resource, may be <code>null</code>
     * @return <code>true</code> if the content is supported
     */
    public static boolean supportedContent(final yacyURL url, String mimeType) {
        mimeType = Classification.normalizeMimeType(mimeType);
        final boolean extensionIndependent =
                mimeType.equals("text/html") ||
                mimeType.equals("application/xhtml+xml") ||
                mimeType.equals("text/plain");
        if (extensionIndependent) {
            return supportedMimeTypesContains(mimeType);
        }
        return supportedMimeTypesContains(mimeType) && supportedFileExt(url);
    }

    /**
     * @param mimeType content type, may be <code>null</code>
     * @return <code>true</code> if the normalised mime type is an html mime type
     *         or a mime type of an enabled parser
     */
    public static boolean supportedMimeTypesContains(String mimeType) {
        mimeType = Classification.normalizeMimeType(mimeType);
        if (Classification.supportedHTMLMimeTypes.contains(mimeType)) return true;
        return enabledParserList.contains(mimeType);
    }

    private static boolean supportedFileExt(final yacyURL url) {
        if (url == null) throw new NullPointerException();
        return supportedFileExtContains(getFileExt(url));
    }

    /**
     * @param fileExt file extension without dot, may be <code>null</code>
     * @return <code>true</code> if the extension is an html extension or an
     *         extension of an enabled parser
     */
    public static boolean supportedFileExtContains(String fileExt) {
        if (fileExt == null) return false;
        fileExt = normalize(fileExt);
        if (Classification.supportedHTMLFileExt.contains(fileExt)) return true;
        return supportedFileExt.contains(fileExt);
    }

    /**
     * Enables the parsers for a comma-separated list of mime types.
     *
     * @param enabledMimeTypes comma-separated mime types, may be <code>null</code> or empty
     */
    public static void addParseableMimeTypes(final String enabledMimeTypes) {
        setEnabledParserList(new HashSet<String>(extString2extList(enabledMimeTypes)));
    }

    /** Enables every parser that is registered in {@link Parser#availableParserList}. */
    public static void enableAllParsers() {
        final Set<String> availableMimeTypes = Parser.availableParserList.keySet();
        setEnabledParserList(availableMimeTypes);
    }

    /**
     * Enables the parsers for the given mime types and registers the file
     * extensions that those parsers declare for the mime type.
     * <p>
     * NOTE(review): despite the name, the mime types and extensions are
     * <em>added</em> to the existing registries; previously enabled parsers
     * stay enabled. Confirm that this is what the settings page expects.
     *
     * @param mimeTypeSet mime types to enable, may be <code>null</code>
     * @return the mime types of the argument for which a parser is available
     */
    public static String[] setEnabledParserList(final Set<String> mimeTypeSet) {
        final HashSet<String> newEnabledParsers = new HashSet<String>();
        final HashSet<String> newSupportedFileExt = new HashSet<String>();

        if (mimeTypeSet != null) {
            for (final String mimeType : mimeTypeSet) {
                final Idiom theParser = Parser.availableParserList.get(mimeType);
                if (theParser == null) {
                    continue;
                }
                try {
                    // getting a list of mimeTypes that the parser supports
                    final Hashtable<String, String> parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
                    if (parserSupportsMimeTypes != null) {
                        final Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
                        if ((supportedExtensions instanceof String) &&
                            (((String) supportedExtensions).length() > 0)) {
                            // normalise the extensions: lookups always use the
                            // trimmed, lower-case form of the queried extension
                            newSupportedFileExt.addAll(extString2extList((String) supportedExtensions));
                        }
                    }
                    newEnabledParsers.add(mimeType);
                } catch (final Exception e) {
                    // a misbehaving parser must not prevent the other parsers from being enabled
                    Log.logSevere("PARSER", "error in setEnabledParserList", e);
                }
            }
        }

        enabledParserList.addAll(newEnabledParsers);
        supportedFileExt.addAll(newSupportedFileExt);

        return newEnabledParsers.toArray(new String[newEnabledParsers.size()]);
    }

    /** @return a copy of the set of mime types with an enabled parser */
    public static HashSet<String> getEnabledParserList() {
        return new HashSet<String>(enabledParserList);
    }

    /**
     * Extracts the file extension from the path of a url.
     *
     * @param url the url, must not be <code>null</code>
     * @return the text behind the last '.' of the last path element (as
     *         written in the url, not normalised) or "" if there is no '.'
     */
    public static String getFileExt(final yacyURL url) {
        // getting the file path
        String name = url.getPath();

        // determining last position of / in the file path
        int p = name.lastIndexOf('/');
        if (p != -1) {
            name = name.substring(p);
        }

        // determining last position of . in file path
        p = name.lastIndexOf('.');
        if (p < 0) {
            return "";
        }
        return name.substring(p + 1);
    }
}

@ -33,7 +33,6 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Enumeration;
@ -295,33 +294,14 @@ public final class Condenser {
int idx;
int wordInSentenceCounter = 1;
boolean comb_indexof = false, last_last = false, last_index = false;
RandomAccessFile fa;
final boolean dumpWords = false;
final HashMap<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>();
if (dumpWords) try {
fa = new RandomAccessFile(new File("dump.txt"), "rw");
fa.seek(fa.length());
} catch (final IOException e) {
e.printStackTrace();
fa = null;
}
// read source
final sievedWordsEnum wordenum = new sievedWordsEnum(is);
while (wordenum.hasMoreElements()) {
word = (new String(wordenum.nextElement())).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars?
if (languageIdentificator != null) languageIdentificator.add(word);
if (word.length() < wordminsize) continue;
//System.out.println("PARSED-WORD " + word);
//This is useful for testing what YaCy "sees" of a website.
if (dumpWords && fa != null) try {
fa.writeBytes(word);
fa.write(160);
} catch (final IOException e) {
e.printStackTrace();
}
// distinguish punctuation and words
wordlen = word.length();
@ -397,15 +377,6 @@ public final class Condenser {
sentences.put(sentence, new Phrase(sentenceHandleCount++));
}
}
if (dumpWords && fa != null) try {
fa.write('\n');
fa.close();
} catch (final IOException e) {
e.printStackTrace();
}
// -------------------
// we reconstruct the sentence hashtable
// and order the entries by the number of the sentence

@ -30,9 +30,12 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
@ -97,7 +100,7 @@ public class Document {
this.languages = languages;
if (text == null) try {
this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
this.text = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
} catch (final IOException e) {
e.printStackTrace();
this.text = new StringBuilder();
@ -371,14 +374,14 @@ dc_rights
} else {
ext = u.substring(extpos + 1).toLowerCase();
}
if (ParserDispatcher.mediaExtContains(ext)) {
if (Classification.mediaExtContains(ext)) {
// this is not a normal anchor, its a media link
if (ParserDispatcher.imageExtContains(ext)) {
if (Classification.imageExtContains(ext)) {
ContentScraper.addImage(collectedImages, new ImageEntry(url, entry.getValue(), -1, -1));
}
else if (ParserDispatcher.audioExtContains(ext)) audiolinks.put(url, entry.getValue());
else if (ParserDispatcher.videoExtContains(ext)) videolinks.put(url, entry.getValue());
else if (ParserDispatcher.appsExtContains(ext)) applinks.put(url, entry.getValue());
else if (Classification.audioExtContains(ext)) audiolinks.put(url, entry.getValue());
else if (Classification.videoExtContains(ext)) videolinks.put(url, entry.getValue());
else if (Classification.appsExtContains(ext)) applinks.put(url, entry.getValue());
} else {
hyperlinks.put(url, entry.getValue());
}
@ -396,21 +399,117 @@ dc_rights
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
hyperlinks.putAll(ParserDispatcher.allReflinks(images.values()));
hyperlinks.putAll(ParserDispatcher.allReflinks(audiolinks.keySet()));
hyperlinks.putAll(ParserDispatcher.allReflinks(videolinks.keySet()));
hyperlinks.putAll(ParserDispatcher.allReflinks(applinks.keySet()));
hyperlinks.putAll(allReflinks(images.values()));
hyperlinks.putAll(allReflinks(audiolinks.keySet()));
hyperlinks.putAll(allReflinks(videolinks.keySet()));
hyperlinks.putAll(allReflinks(applinks.keySet()));
/*
hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(images.values()));
hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet()));
hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet()));
hyperlinks.putAll(allSubpaths(hyperlinks.keySet()));
hyperlinks.putAll(allSubpaths(images.values()));
hyperlinks.putAll(allSubpaths(audiolinks.keySet()));
hyperlinks.putAll(allSubpaths(videolinks.keySet()));
hyperlinks.putAll(allSubpaths(applinks.keySet()));
*/
// don't do this again
this.resorted = true;
}
/**
 * Computes all parent paths of the given links.
 * Every link is brought into its normal form, a trailing '/' is dropped and
 * then the path is shortened slash by slash; each prefix (including its
 * trailing '/') is collected as long as the last slash is found behind
 * index 8 (presumably to stop before the "scheme://" part - TODO confirm).
 * Links and prefixes that do not form a valid url are skipped silently.
 *
 * @param links collection of yacyURL objects, url Strings or ImageEntry objects
 * @return a map that assigns the marker "sub" to every url built from a collected prefix
 */
public static Map<yacyURL, String> allSubpaths(final Collection<?> links) {
    // first pass: collect the prefixes as strings, which removes duplicates
    final HashSet<String> prefixes = new HashSet<String>();
    for (final Object link : links) {
        try {
            final yacyURL source;
            if (link instanceof yacyURL) {
                source = (yacyURL) link;
            } else if (link instanceof String) {
                source = new yacyURL((String) link, null);
            } else if (link instanceof ImageEntry) {
                source = ((ImageEntry) link).url();
            } else {
                assert false;
                continue;
            }

            String rest = source.toNormalform(true, true);
            if (rest.endsWith("/")) {
                rest = rest.substring(0, rest.length() - 1);
            }
            int slash = rest.lastIndexOf('/');
            while (slash > 8) {
                final int lengthBefore = rest.length();
                prefixes.add(rest.substring(0, slash + 1));
                rest = rest.substring(0, slash);
                assert (rest.length() < lengthBefore) : "u = " + rest;
                slash = rest.lastIndexOf('/');
            }
        } catch (final MalformedURLException e) {
            // not a usable link, go on with the next one
        }
    }

    // second pass: convert the strings to yacyURLs
    final HashMap<yacyURL, String> subpaths = new HashMap<yacyURL, String>();
    for (final String prefix : prefixes) {
        try {
            subpaths.put(new yacyURL(prefix, null), "sub");
        } catch (final MalformedURLException e) {
            // prefix is not a valid url, leave it out
        }
    }
    return subpaths;
}
/**
 * Finds urls that are embedded as a reference inside other urls.
 * A link counts as a reference carrier if its normal form contains
 * "http://" or, failing that, "/www." at index 7 or later (compared in
 * lower case). The innermost such reference is extracted; for the
 * "/www." case "http:/" is put in front of it.
 * <p>
 * Side effect: every carrier link is removed from <code>links</code>
 * through the iterator, even if the extracted reference turns out not to
 * be a valid url. Invalid urls are skipped silently.
 *
 * @param links collection of yacyURL objects, url Strings or ImageEntry
 *        objects; its iterator must support <code>remove()</code> if a
 *        carrier link is present
 * @return a map that assigns the marker "ref" to every extracted url
 */
public static Map<yacyURL, String> allReflinks(final Collection<?> links) {
    final HashMap<yacyURL, String> references = new HashMap<yacyURL, String>();
    final Iterator<?> it = links.iterator();
    while (it.hasNext()) {
        try {
            final Object link = it.next();
            yacyURL target;
            if (link instanceof yacyURL) {
                target = (yacyURL) link;
            } else if (link instanceof String) {
                target = new yacyURL((String) link, null);
            } else if (link instanceof ImageEntry) {
                target = ((ImageEntry) link).url();
            } else {
                assert false;
                continue;
            }

            String form = target.toNormalform(true, true);

            // case 1: a complete url follows behind the leading scheme
            int at = form.toLowerCase().indexOf("http://", 7);
            if (at > 0) {
                it.remove();
                do {
                    form = form.substring(at);
                    at = form.toLowerCase().indexOf("http://", 7);
                } while (at > 0);
                target = new yacyURL(form, null);
                if (!references.containsKey(target)) {
                    references.put(target, "ref");
                }
                continue;
            }

            // case 2: a host starting with www. appears as a path element
            at = form.toLowerCase().indexOf("/www.", 7);
            if (at > 0) {
                it.remove();
                do {
                    form = "http:/" + form.substring(at);
                    at = form.toLowerCase().indexOf("/www.", 7);
                } while (at > 0);
                target = new yacyURL(form, null);
                if (!references.containsKey(target)) {
                    references.put(target, "ref");
                }
                continue;
            }
        } catch (final MalformedURLException e) {
            // not a usable url, go on with the next link
        }
    }
    return references;
}
public void addSubDocument(final Document doc) throws IOException {
this.sections.addAll(Arrays.asList(doc.getSectionTitles()));
@ -423,7 +522,7 @@ dc_rights
this.description.append(doc.dc_description());
if (!(this.text instanceof serverCachedFileOutputStream)) {
this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
this.text = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
FileUtils.copy(getText(), (serverCachedFileOutputStream)this.text);
}
FileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text);

@ -0,0 +1,108 @@
//Idiom.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//this file was contributed by Martin Thelian
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
import java.io.File;
import java.io.InputStream;
import java.util.Hashtable;
import de.anomic.yacy.yacyURL;
/**
* This interface defines a list of methods that needs to be implemented
* by each content parser class.
* @author Martin Thelian
* @version $LastChangedRevision$ / $LastChangedDate$
*/
public interface Idiom {

    /**
     * Size limit of 5 * 1024 * 1024 bytes; the Document class passes it to the
     * constructor of its cached file output stream (presumably the amount
     * of text that is kept in memory before a file is used - TODO confirm).
     */
    long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024;

    /**
     * Parses a document that is available as byte array.
     *
     * @param location the origin of the document
     * @param mimeType the mimetype of the document
     * @param charset the supposed charset of the document or <code>null</code> if unknown
     * @param source the content byte array
     * @return a {@link Document} containing the extracted plain text of the document
     *         and some additional metadata
     * @throws ParserException if the content could not be parsed properly
     * @throws InterruptedException declared for implementations; presumably
     *         thrown when the parsing thread gets interrupted
     */
    Document parse(yacyURL location, String mimeType, String charset, byte[] source)
            throws ParserException, InterruptedException;

    /**
     * Parses a document that is stored in a {@link File}.
     *
     * @param location the origin of the document
     * @param mimeType the mimetype of the document
     * @param charset the supposed charset of the document or <code>null</code> if unknown
     * @param sourceFile the file containing the content of the document
     * @return a {@link Document} containing the extracted plain text of the document
     *         and some additional metadata
     * @throws ParserException if the content could not be parsed properly
     * @throws InterruptedException declared for implementations; presumably
     *         thrown when the parsing thread gets interrupted
     */
    Document parse(yacyURL location, String mimeType, String charset, File sourceFile)
            throws ParserException, InterruptedException;

    /**
     * Parses a document that is available as {@link InputStream}.
     *
     * @param location the origin of the document
     * @param mimeType the mimetype of the document
     * @param charset the supposed charset of the document or <code>null</code> if unknown
     * @param source the {@link InputStream} containing the document content
     * @return a {@link Document} containing the extracted plain text of the document
     *         and some additional metadata
     * @throws ParserException if the content could not be parsed properly
     * @throws InterruptedException declared for implementations; presumably
     *         thrown when the parsing thread gets interrupted
     */
    Document parse(yacyURL location, String mimeType, String charset, InputStream source)
            throws ParserException, InterruptedException;

    /**
     * Reports the mime types that the parser supports.
     *
     * @return a {@link Hashtable} containing the supported mime types; the
     *         Classification class reads the value of a mime type as a
     *         comma-separated list of file extensions
     */
    Hashtable<String, String> getSupportedMimeTypes();

    /**
     * This function should be called before reusing the parser object.
     */
    void reset();

    /**
     * Hands a length to the parser.
     * NOTE(review): presumably the length of the content that is parsed next;
     * the unit and the effect are not visible here - confirm against the implementations.
     *
     * @param length the content length
     */
    void setContentLength(long length);

    /**
     * Returns the name of the parser.
     *
     * @return parser name
     */
    String getName();
}

@ -1,108 +1,190 @@
//Parser.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//this file was contributed by Martin Thelian
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
import java.io.File;
import java.io.InputStream;
import java.util.Hashtable;
import de.anomic.yacy.yacyURL;
/**
* This interface defines a list of methods that needs to be implemented
* by each content parser class.
* @author Martin Thelian
* @version $LastChangedRevision$ / $LastChangedDate$
*/
public interface Parser {

    /**
     * Size limit of 5 * 1024 * 1024 bytes (presumably the amount of text
     * that is kept in memory before a file is used - TODO confirm).
     */
    long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024;

    /**
     * Parses a document that is available as byte array.
     *
     * @param location the origin of the document
     * @param mimeType the mimetype of the document
     * @param charset the supposed charset of the document or <code>null</code> if unknown
     * @param source the content byte array
     * @return a {@link Document} containing the extracted plain text of the document
     *         and some additional metadata
     * @throws ParserException if the content could not be parsed properly
     */
    Document parse(yacyURL location, String mimeType, String charset, byte[] source)
            throws ParserException, InterruptedException;

    /**
     * Parses a document that is stored in a {@link File}.
     *
     * @param location the origin of the document
     * @param mimeType the mimetype of the document
     * @param charset the supposed charset of the document or <code>null</code> if unknown
     * @param sourceFile the file containing the content of the document
     * @return a {@link Document} containing the extracted plain text of the document
     *         and some additional metadata
     * @throws ParserException if the content could not be parsed properly
     */
    Document parse(yacyURL location, String mimeType, String charset, File sourceFile)
            throws ParserException, InterruptedException;

    /**
     * Parses a document that is available as {@link InputStream}.
     *
     * @param location the origin of the document
     * @param mimeType the mimetype of the document
     * @param charset the supposed charset of the document or <code>null</code> if unknown
     * @param source the {@link InputStream} containing the document content
     * @return a {@link Document} containing the extracted plain text of the document
     *         and some additional metadata
     * @throws ParserException if the content could not be parsed properly
     */
    Document parse(yacyURL location, String mimeType, String charset, InputStream source)
            throws ParserException, InterruptedException;

    /**
     * Reports the mime types that the parser supports.
     *
     * @return a {@link Hashtable} containing a list of mime types that are
     *         supported by the parser
     */
    Hashtable<String, String> getSupportedMimeTypes();

    /**
     * This function should be called before reusing the parser object.
     */
    void reset();

    /**
     * Hands a length to the parser.
     * NOTE(review): meaning and unit of the length are not visible here - confirm
     * against the implementations.
     *
     * @param length the content length
     */
    void setContentLength(long length);

    /**
     * Returns the name of the parser.
     *
     * @return parser name
     */
    String getName();
}
// Parser.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 09.07.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision: 5736 $
// $LastChangedBy: borg-0300 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import de.anomic.document.parser.bzipParser;
import de.anomic.document.parser.docParser;
import de.anomic.document.parser.gzipParser;
import de.anomic.document.parser.htmlParser;
import de.anomic.document.parser.mimeTypeParser;
import de.anomic.document.parser.odtParser;
import de.anomic.document.parser.pdfParser;
import de.anomic.document.parser.pptParser;
import de.anomic.document.parser.psParser;
import de.anomic.document.parser.rpmParser;
import de.anomic.document.parser.rssParser;
import de.anomic.document.parser.rtfParser;
import de.anomic.document.parser.sevenzipParser;
import de.anomic.document.parser.swfParser;
import de.anomic.document.parser.tarParser;
import de.anomic.document.parser.vcfParser;
import de.anomic.document.parser.vsdParser;
import de.anomic.document.parser.xlsParser;
import de.anomic.document.parser.zipParser;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
/**
 * Static registry of all built-in document parsers ({@link Idiom} instances)
 * and entry point for parsing a document given as byte array, file or stream.
 *
 * NOTE(review): the registered {@link Idiom} instances are shared by all
 * callers and receive per-document state through
 * {@link Idiom#setContentLength(long)}; whether concurrent calls to
 * {@code parseSource} are safe depends on the individual parsers - confirm.
 */
public final class Parser {

    private static final Log theLogger = new Log("PARSER");

    /**
     * Maps a mime type to the parser that registered itself for it.
     * If several parsers announce the same mime type, the one registered last wins.
     */
    public static final HashMap<String, Idiom> availableParserList = new HashMap<String, Idiom>();

    static {
        initParser(new bzipParser());
        initParser(new docParser());
        initParser(new gzipParser());
        initParser(new mimeTypeParser());
        initParser(new odtParser());
        initParser(new pdfParser());
        initParser(new pptParser());
        initParser(new psParser());
        initParser(new rpmParser());
        initParser(new rssParser());
        initParser(new rtfParser());
        initParser(new sevenzipParser());
        initParser(new swfParser());
        initParser(new tarParser());
        initParser(new vcfParser());
        initParser(new vsdParser());
        initParser(new xlsParser());
        initParser(new zipParser());
    }

    /**
     * Registers the given parser for every mime type it announces.
     *
     * @param theParser the parser to register
     */
    private static void initParser(final Idiom theParser) {
        final Hashtable<String, String> supportedMimeTypes = theParser.getSupportedMimeTypes();
        if (supportedMimeTypes == null) {
            // a parser without a mime type table must not abort the class initialization
            Log.logInfo("PARSER", "Parser " + theParser.getName() + " does not announce any mimeType");
            return;
        }
        for (final String mimeType : supportedMimeTypes.keySet()) {
            availableParserList.put(mimeType, theParser);
            Log.logInfo("PARSER", "Found parser for mimeType '" + mimeType + "': " + theParser.getName());
        }
    }

    /**
     * Parses a document that is available as byte array.
     *
     * @param location the origin of the document
     * @param mimeType the mime type of the document
     * @param charset the supposed charset of the document
     * @param sourceArray the document content
     * @return the parsed document, never <code>null</code>
     * @throws InterruptedException passed through from the parser
     * @throws ParserException if the array is <code>null</code> or empty, if no parser
     *         is available or if parsing failed
     */
    public static Document parseSource(final yacyURL location,
            final String mimeType, final String charset,
            final byte[] sourceArray) throws InterruptedException,
            ParserException {
        try {
            if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from byte-array");
            if (sourceArray == null || sourceArray.length == 0) {
                final String errorMsg = "No resource content available (1) " + (((sourceArray == null) ? "source == null" : "source.length() == 0") + ", url = " + location.toNormalform(true, false));
                theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
                throw new ParserException(errorMsg, location, errorMsg);
            }
            // a ByteArrayInputStream holds no system resources, closing it is not necessary
            return parseSource(location, mimeType, charset, sourceArray.length, new ByteArrayInputStream(sourceArray));
        } catch (final InterruptedException e) {
            throw e;
        } catch (final ParserException e) {
            throw e;
        } catch (final Exception e) {
            theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e);
            throw new ParserException("Unexpected exception while parsing " + location, location, e);
        }
    }

    /**
     * Parses a document that is stored in a file.
     *
     * @param location the origin of the document
     * @param mimeType the mime type of the document
     * @param charset the supposed charset of the document
     * @param sourceFile the file containing the document content
     * @return the parsed document, never <code>null</code>
     * @throws InterruptedException passed through from the parser
     * @throws ParserException if the file is missing, unreadable or empty, if no parser
     *         is available or if parsing failed
     */
    public static Document parseSource(final yacyURL location,
            final String mimeType, final String charset,
            final File sourceFile) throws InterruptedException, ParserException {
        BufferedInputStream sourceStream = null;
        try {
            if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from file");
            if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
                final String errorMsg;
                if (!sourceFile.exists()) {
                    errorMsg = "No resource content available (2).";
                } else if (!sourceFile.canRead()) {
                    // do not report an unreadable file as an empty one
                    errorMsg = "Resource file is not readable.";
                } else {
                    errorMsg = "Empty resource file.";
                }
                theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
                throw new ParserException(errorMsg, location, "document has no content");
            }
            sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
            return parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
        } catch (final InterruptedException e) {
            throw e;
        } catch (final ParserException e) {
            throw e;
        } catch (final Exception e) {
            theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
            throw new ParserException("Unexpected exception while parsing " + location, location, e);
        } finally {
            if (sourceStream != null) {
                try {
                    sourceStream.close();
                } catch (final Exception ignored) {
                    // best effort: a failure while closing must not hide the parsing result
                }
            }
        }
    }

    /**
     * Parses a document that is available as {@link InputStream}.
     * The stream is not closed by this method.
     *
     * @param location the origin of the document
     * @param mimeType the mime type of the document; it is normalized before use
     * @param charset the supposed charset of the document
     * @param contentLength the length of the document content, handed to the parser
     * @param sourceStream the stream containing the document content
     * @return the parsed document, never <code>null</code>
     * @throws InterruptedException passed through from the parser
     * @throws ParserException if the content is not supported, if no parser is
     *         available, if the parser returned <code>null</code> or if parsing failed
     */
    public static Document parseSource(final yacyURL location,
            String mimeType, final String charset,
            final long contentLength, final InputStream sourceStream)
            throws InterruptedException, ParserException {
        try {
            if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from stream");
            mimeType = Classification.normalizeMimeType(mimeType);
            final String fileExt = Classification.getFileExt(location);
            final String documentCharset = htmlParser.patchCharsetEncoding(charset);

            if (!Classification.supportedContent(location, mimeType)) {
                final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (1)";
                theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
                throw new ParserException(errorMsg, location, "wrong mime type or wrong extension");
            }

            // log on the same level that is tested by the guard
            if (theLogger.isFine()) theLogger.logFine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");

            final Idiom parser = availableParserList.get(Classification.normalizeMimeType(mimeType));
            final Document doc;
            if (parser != null) {
                parser.setContentLength(contentLength);
                doc = parser.parse(location, mimeType, documentCharset, sourceStream);
            } else if (Classification.HTMLParsableMimeTypesContains(mimeType)) {
                // no registered parser, but the html parser is configured for this mime type
                doc = new htmlParser().parse(location, mimeType, documentCharset, sourceStream);
            } else {
                final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)";
                theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
                throw new ParserException(errorMsg, location, "wrong mime type or wrong extension");
            }

            if (doc == null) {
                final String errorMsg = "Unexpected error. Parser returned null.";
                theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
                throw new ParserException(errorMsg, location);
            }
            return doc;
        } catch (final InterruptedException e) {
            throw e;
        } catch (final ParserException e) {
            throw e;
        } catch (final Exception e) {
            final String errorMsg = "Unexpected exception. " + e.getMessage();
            theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
            throw new ParserException(errorMsg, location, e);
        }
    }
}

@ -1,174 +0,0 @@
// plasmaParserConfig.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file is contributed by Martin Thelian
//
// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
// $LastChangedRevision: 1715 $
// $LastChangedBy: theli $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Set;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
/**
 * Holds the set of enabled parser mime types and the file extensions that
 * belong to them, and answers whether a given url/mime type combination is
 * supported.
 */
public class ParserConfig {

    /**
     * The mime types of all enabled parsers.
     * Access is synchronized on the set itself.
     * @see #setEnabledParserList(Set)
     * @see #getEnabledParserList()
     */
    public final HashSet<String> enabledParserList;

    /**
     * The file extensions (trimmed, lower case) that are supported by all enabled parsers.
     * Access is synchronized on the set itself.
     */
    private final HashSet<String> supportedFileExt;

    public ParserConfig() {
        supportedFileExt = new HashSet<String>();
        enabledParserList = new HashSet<String>();
    }

    /**
     * Tests whether a document can be parsed. For <code>text/html</code>,
     * <code>application/xhtml+xml</code> and <code>text/plain</code> only the
     * mime type is tested, for all other mime types the file extension of
     * the url must be supported as well.
     *
     * @param url the origin of the document
     * @param mimeType the mime type of the document; it is normalized before use
     * @return <code>true</code> if the content is supported
     * @throws NullPointerException if the extension has to be tested and url is <code>null</code>
     */
    public boolean supportedContent(final yacyURL url, String mimeType) {
        // TODO: we need some exceptions here to index URLs like this
        // http://www.musicabona.com/respighi/12668/cd/index.html.fr
        mimeType = ParserDispatcher.normalizeMimeType(mimeType);
        final boolean extensionIrrelevant =
            mimeType.equals("text/html") ||
            mimeType.equals("application/xhtml+xml") ||
            mimeType.equals("text/plain");
        if (extensionIrrelevant) {
            return supportedMimeTypesContains(mimeType);
        }
        return supportedMimeTypesContains(mimeType) && supportedFileExt(url);
    }

    /**
     * @param mimeType the mime type to test; it is normalized before use
     * @return <code>true</code> if the mime type is either handled by the
     *         html parser or belongs to an enabled parser
     */
    public boolean supportedMimeTypesContains(String mimeType) {
        mimeType = ParserDispatcher.normalizeMimeType(mimeType);
        synchronized (ParserDispatcher.supportedHTMLMimeTypes) {
            if (ParserDispatcher.supportedHTMLMimeTypes.contains(mimeType)) return true;
        }
        synchronized (this.enabledParserList) {
            return this.enabledParserList.contains(mimeType);
        }
    }

    /**
     * Tests whether the file extension of the given url is supported.
     */
    private boolean supportedFileExt(final yacyURL url) {
        if (url == null) throw new NullPointerException();
        return supportedFileExtContains(ParserDispatcher.getFileExt(url));
    }

    /**
     * @param fileExt the file extension to test (without dot); it is trimmed
     *        and converted to lower case before the test
     * @return <code>true</code> if the extension is either handled by the
     *         html parser or belongs to an enabled parser,
     *         <code>false</code> otherwise or if fileExt is <code>null</code>
     */
    public boolean supportedFileExtContains(String fileExt) {
        if (fileExt == null) return false;
        fileExt = fileExt.trim().toLowerCase();
        synchronized (ParserDispatcher.supportedHTMLFileExt) {
            if (ParserDispatcher.supportedHTMLFileExt.contains(fileExt)) return true;
        }
        synchronized (this.supportedFileExt) {
            return this.supportedFileExt.contains(fileExt);
        }
    }

    /**
     * Enables the parsers for a comma separated list of mime types.
     *
     * @param enabledMimeTypes comma separated mime types; <code>null</code>
     *        or an empty string enables nothing
     */
    public void addParseableMimeTypes(final String enabledMimeTypes) {
        final HashSet<String> mimeTypes = new HashSet<String>();
        if ((enabledMimeTypes != null) && (enabledMimeTypes.length() > 0)) {
            for (final String mimeType : enabledMimeTypes.split(",")) {
                mimeTypes.add(mimeType.toLowerCase().trim());
            }
        }
        setEnabledParserList(mimeTypes);
    }

    /**
     * Enables every parser that is registered in
     * {@link ParserDispatcher#availableParserList}.
     */
    public void enableAllParsers() {
        final Set<String> availableMimeTypes = ParserDispatcher.availableParserList.keySet();
        setEnabledParserList(availableMimeTypes);
    }

    /**
     * Enables the parsers for the given mime types. Mime types without a
     * registered parser are ignored.
     *
     * NOTE(review): despite its name this method only adds to the enabled
     * mime types and extensions, nothing that was enabled before is removed -
     * confirm that this is what the settings pages expect.
     *
     * @param mimeTypeSet the mime types to enable, may be <code>null</code>
     * @return the mime types from the given set that have been enabled
     */
    public String[] setEnabledParserList(final Set<String> mimeTypeSet) {
        final HashSet<String> newEnabledParsers = new HashSet<String>();
        final HashSet<String> newSupportedFileExt = new HashSet<String>();

        if (mimeTypeSet != null) {
            for (final String mimeType : mimeTypeSet) {
                final Parser theParser = ParserDispatcher.availableParserList.get(mimeType);
                if (theParser == null) continue;
                try {
                    // the parser maps each supported mime type to a comma separated extension list
                    final Hashtable<String, String> parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
                    if (parserSupportsMimeTypes != null) {
                        final String supportedExtensions = parserSupportsMimeTypes.get(mimeType);
                        if ((supportedExtensions != null) && (supportedExtensions.length() > 0)) {
                            for (final String ext : supportedExtensions.split(",")) {
                                // store the extension in the same form that
                                // supportedFileExtContains uses for the lookup
                                final String normalizedExt = ext.trim().toLowerCase();
                                // an empty entry would declare urls without extension as supported
                                if (normalizedExt.length() > 0) newSupportedFileExt.add(normalizedExt);
                            }
                        }
                    }
                    newEnabledParsers.add(mimeType);
                } catch (final Exception e) {
                    Log.logSevere("PARSER", "error in setEnabledParserList", e);
                }
            }
        }

        synchronized (this.enabledParserList) {
            this.enabledParserList.addAll(newEnabledParsers);
        }
        synchronized (this.supportedFileExt) {
            this.supportedFileExt.addAll(newSupportedFileExt);
        }
        return newEnabledParsers.toArray(new String[newEnabledParsers.size()]);
    }

    /**
     * @return a copy of the enabled mime types; changes to the returned set
     *         do not affect this configuration
     */
    public HashSet<String> getEnabledParserList() {
        synchronized (this.enabledParserList) {
            return new HashSet<String>(this.enabledParserList);
        }
    }
}

@ -1,576 +0,0 @@
package de.anomic.document;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import de.anomic.document.parser.bzipParser;
import de.anomic.document.parser.docParser;
import de.anomic.document.parser.gzipParser;
import de.anomic.document.parser.htmlParser;
import de.anomic.document.parser.mimeTypeParser;
import de.anomic.document.parser.odtParser;
import de.anomic.document.parser.pdfParser;
import de.anomic.document.parser.pptParser;
import de.anomic.document.parser.psParser;
import de.anomic.document.parser.rpmParser;
import de.anomic.document.parser.rssParser;
import de.anomic.document.parser.rtfParser;
import de.anomic.document.parser.sevenzipParser;
import de.anomic.document.parser.swfParser;
import de.anomic.document.parser.tarParser;
import de.anomic.document.parser.vcfParser;
import de.anomic.document.parser.vsdParser;
import de.anomic.document.parser.xlsParser;
import de.anomic.document.parser.zipParser;
import de.anomic.document.parser.html.ImageEntry;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
public final class ParserDispatcher {
public static final ParserConfig parserConfig = new ParserConfig();
/**
* A list containing all installed parsers and the mimeType that they support
* @see #loadAvailableParserList()
*/
public static final HashMap<String, Parser> availableParserList = new HashMap<String, Parser>();
/**
* A list of file extensions and mime types that are supported by the html-parser
*/
public static final HashSet<String> supportedHTMLFileExt = new HashSet<String>();
public static final HashSet<String> supportedHTMLMimeTypes = new HashSet<String>();
private static final Properties mimeTypeLookupByFileExt = new Properties();
static {
// loading a list of extensions from file
BufferedInputStream bufferedIn = null;
try {
mimeTypeLookupByFileExt.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime"))));
} catch (final IOException e) {
System.err.println("ERROR: httpd.mime not found in settings path");
} finally {
if (bufferedIn != null) try{bufferedIn.close();}catch(final Exception e){}
}
}
/**
* A list of media extensions that should <b>not</b> be handled by the Parser
*/
private static final HashSet<String> mediaExtSet = new HashSet<String>();
/**
* A list of image, audio, video and application extensions
*/
private static final HashSet<String> imageExtSet = new HashSet<String>();
private static final HashSet<String> audioExtSet = new HashSet<String>();
private static final HashSet<String> videoExtSet = new HashSet<String>();
private static final HashSet<String> appsExtSet = new HashSet<String>();
/**
* Initializing the
* @see #initMediaExt(String)
*/
static {
final String apps = "sit,hqx,img,dmg,exe,com,bat,sh,vbs,zip,jar";
final String audio = "mp2,mp3,ogg,aac,aif,aiff,wav";
final String video = "swf,avi,wmv,rm,mov,mpg,mpeg,ram,m4v";
final String image = "jpg,jpeg,jpe,gif,png,ico,bmp";
initMediaExt(extString2extList(
apps + "," + // application container
"tar,gz,bz2,arj,zip,rar," + // archive formats
"ps,xls,ppt,asf," + // text formats without support
audio + "," + // audio formats
video + "," + // video formats
image // image formats
));
initImageExt(extString2extList(image)); // image formats
initAudioExt(extString2extList(audio)); // audio formats
initVideoExt(extString2extList(video)); // video formats
initAppsExt(extString2extList(apps)); // application formats
/* ===================================================
* loading a list of available parsers
* =================================================== */
loadAvailableParserList();
}
private static final Log theLogger = new Log("PARSER");
/**
* This function is used to initialize the HTMLParsableMimeTypes List.
* This list contains a list of mimeTypes that can be parsed in realtime by
* the yacy html-Parser
* @param htmlParsableMimeTypes a list of mimetypes that can be parsed by the
* yacy html parser
*/
public static void initHTMLParsableMimeTypes(final String htmlParsableMimeTypes) {
final LinkedList<String> mimeTypes = new LinkedList<String>();
if ((htmlParsableMimeTypes == null) || (htmlParsableMimeTypes.length() == 0)) {
return;
}
final String[] realtimeParsableMimeTypeList = htmlParsableMimeTypes.split(",");
for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) {
mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase().trim());
}
synchronized (supportedHTMLMimeTypes) {
supportedHTMLMimeTypes.addAll(mimeTypes);
}
}
public static List<String> extString2extList(final String extString) {
final LinkedList<String> extensions = new LinkedList<String>();
if ((extString == null) || (extString.length() == 0)) {
return extensions;
}
final String[] xs = extString.split(",");
for (int i = 0; i < xs.length; i++) extensions.add(xs[i].toLowerCase().trim());
return extensions;
}
public static void initMediaExt(final List<String> mediaExtList) {
synchronized (mediaExtSet) {
mediaExtSet.addAll(mediaExtList);
}
}
private static void initImageExt(final List<String> imageExtList) {
synchronized (imageExtSet) {
imageExtSet.addAll(imageExtList);
}
}
private static void initAudioExt(final List<String> audioExtList) {
synchronized (audioExtSet) {
audioExtSet.addAll(audioExtList);
}
}
private static void initVideoExt(final List<String> videoExtList) {
synchronized (videoExtSet) {
videoExtSet.addAll(videoExtList);
}
}
private static void initAppsExt(final List<String> appsExtList) {
synchronized (appsExtSet) {
appsExtSet.addAll(appsExtList);
}
}
public static void initSupportedHTMLFileExt(final List<String> supportedRealtimeFileExtList) {
synchronized (supportedHTMLFileExt) {
supportedHTMLFileExt.addAll(supportedRealtimeFileExtList);
}
}
private static boolean HTMLParsableMimeTypesContains(String mimeType) {
mimeType = normalizeMimeType(mimeType);
synchronized (supportedHTMLMimeTypes) {
return supportedHTMLMimeTypes.contains(mimeType);
}
}
public static String getFileExt(final yacyURL url) {
// getting the file path
String name = url.getPath();
// tetermining last position of / in the file path
int p = name.lastIndexOf('/');
if (p != -1) {
name = name.substring(p);
}
// termining last position of . in file path
p = name.lastIndexOf('.');
if (p < 0) return "";
return name.substring(p + 1);
}
public static boolean mediaExtContains(String mediaExt) {
if (mediaExt == null) return false;
mediaExt = mediaExt.trim().toLowerCase();
synchronized (supportedHTMLFileExt) {
if (supportedHTMLFileExt.contains(mediaExt)) return false;
}
if (supportedFileExtContains(mediaExt)) return false;
synchronized (mediaExtSet) {
return mediaExtSet.contains(mediaExt);
}
}
public static boolean imageExtContains(final String imageExt) {
if (imageExt == null) return false;
synchronized (imageExtSet) {
return imageExtSet.contains(imageExt.trim().toLowerCase());
}
}
public static boolean audioExtContains(final String audioExt) {
if (audioExt == null) return false;
synchronized (audioExtSet) {
return audioExtSet.contains(audioExt.trim().toLowerCase());
}
}
public static boolean videoExtContains(final String videoExt) {
if (videoExt == null) return false;
synchronized (videoExtSet) {
return videoExtSet.contains(videoExt.trim().toLowerCase());
}
}
public static boolean appsExtContains(final String appsExt) {
if (appsExt == null) return false;
synchronized (appsExtSet) {
return appsExtSet.contains(appsExt.trim().toLowerCase());
}
}
public static String normalizeMimeType(String mimeType) {
//if (mimeType == null) doMimeTypeAnalysis
if (mimeType == null) mimeType = "application/octet-stream";
mimeType = mimeType.trim().toLowerCase();
final int pos = mimeType.indexOf(';');
return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
}
public static String getMimeTypeByFileExt(final String fileExt) {
return mimeTypeLookupByFileExt.getProperty(fileExt,"application/octet-stream");
}
public static HashMap<String, Parser> getAvailableParserList() {
return availableParserList;
}
private static void loadAvailableParserList() {
initParser(new bzipParser());
initParser(new docParser());
initParser(new gzipParser());
initParser(new mimeTypeParser());
initParser(new odtParser());
initParser(new pdfParser());
initParser(new pptParser());
initParser(new psParser());
initParser(new rpmParser());
initParser(new rssParser());
initParser(new rtfParser());
initParser(new sevenzipParser());
initParser(new swfParser());
initParser(new tarParser());
initParser(new vcfParser());
initParser(new vsdParser());
initParser(new xlsParser());
initParser(new zipParser());
}
private static void initParser(Parser theParser) {
// loading the list of mime-types that are supported by this parser class
final Hashtable<String, String> supportedMimeTypes = theParser.getSupportedMimeTypes();
final Iterator<String> mimeTypeIterator = supportedMimeTypes.keySet().iterator();
while (mimeTypeIterator.hasNext()) {
final String mimeType = mimeTypeIterator.next();
availableParserList.put(mimeType, theParser);
Log.logInfo("PARSER", "Found parser for mimeType '" + mimeType + "'." +
"\n\tName: " + theParser.getName());
}
}
public static Document parseSource(final yacyURL location, final String mimeType, final String charset, final byte[] sourceArray)
throws InterruptedException, ParserException {
ByteArrayInputStream byteIn = null;
try {
if (theLogger.isFine())
theLogger.logFine("Parsing '" + location + "' from byte-array");
// testing if the resource is not empty
if (sourceArray == null || sourceArray.length == 0) {
final String errorMsg = "No resource content available (1) " + (((sourceArray == null) ? "source == null" : "source.length() == 0") + ", url = " + location.toNormalform(true, false));
theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location, errorMsg);
}
// creating an InputStream
byteIn = new ByteArrayInputStream(sourceArray);
// parsing the temp file
return parseSource(location, mimeType, charset, sourceArray.length, byteIn);
} catch (final Exception e) {
// Interrupted- and Parser-Exceptions should pass through
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
// log unexpected error
theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location,location, e);
} finally {
if (byteIn != null) try { byteIn.close(); } catch (final Exception ex){/* ignore this */}
}
}
public static Document parseSource(final yacyURL location, final String theMimeType, final String theDocumentCharset, final File sourceFile) throws InterruptedException, ParserException {
BufferedInputStream sourceStream = null;
try {
if (theLogger.isFine())
theLogger.logFine("Parsing '" + location + "' from file");
// testing if the resource is not empty
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2).";
theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location, "document has no content");
}
// create a new InputStream
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
// parsing the data
return parseSource(location, theMimeType, theDocumentCharset, sourceFile.length(), sourceStream);
} catch (final Exception e) {
// Interrupted- and Parser-Exceptions should pass through
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
// log unexpected error
theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location,location, e);
} finally {
if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex){/* ignore this */}
}
}
/**
* To parse a resource from an {@link InputStream}
* @param location the URL of the resource
* @param theMimeType the resource mimetype (<code>null</code> if unknown)
* @param theDocumentCharset the charset of the resource (<code>null</code> if unknown)
* @param contentLength the content length of the resource (<code>-1</code> if unknown)
* @param sourceStream an {@link InputStream} containing the resource body
* @return the parsed {@link ParserDocument document}
* @throws InterruptedException
* @throws ParserException
*/
public static Document parseSource(final yacyURL location, final String theMimeType, final String theDocumentCharset, final long contentLength, final InputStream sourceStream) throws InterruptedException, ParserException {
Parser theParser = null;
String mimeType = null;
try {
if (theLogger.isFine())
theLogger.logFine("Parsing '" + location + "' from stream");
// getting the mimetype of the document
mimeType = normalizeMimeType(theMimeType);
// getting the file extension of the document
final String fileExt = getFileExt(location);
// getting the charset of the document
// TODO: do a charset detection here ....
final String documentCharset = htmlParser.patchCharsetEncoding(theDocumentCharset);
// testing if parsing is supported for this resource
if (!supportedContent(location,mimeType)) {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (1)";
theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location, "wrong mime type or wrong extension");
}
if (theLogger.isFine())
theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType +
"' and file extension '" + fileExt + "'.");
// getting the correct parser for the given mimeType
theParser = getParser(mimeType);
// if a parser was found we use it ...
Document doc = null;
if (theParser != null) {
// set the content length of the resource
theParser.setContentLength(contentLength);
// parse the resource
doc = theParser.parse(location, mimeType,documentCharset,sourceStream);
} else if (HTMLParsableMimeTypesContains(mimeType)) {
doc = new htmlParser().parse(location, mimeType, documentCharset, sourceStream);
} else {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)";
theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location, "wrong mime type or wrong extension");
}
// check result
if (doc == null) {
final String errorMsg = "Unexpected error. Parser returned null.";
theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location);
}
return doc;
} catch (final Exception e) {
// Interrupted- and Parser-Exceptions should pass through
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
// log unexpected error
final String errorMsg = "Unexpected exception. " + e.getMessage();
theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
throw new ParserException(errorMsg,location,e);
} finally {
if (theParser != null) {
theParser = null; // delete object
}
}
}
/**
* This function is used to determine the parser class that should be used for a given
* mimetype ...
* @param mimeType MIME-Type of the resource
* @return the {@link Parser}-class that is supposed to parse the resource of
* the given MIME-Type
*/
private static Parser getParser(String mimeType) {
mimeType = normalizeMimeType(mimeType);
// determining the proper parser class name for the mimeType
return availableParserList.get(mimeType);
}
public static Map<yacyURL, String> allReflinks(final Collection<?> links) {
// links is either a Set of Strings (with urls) or htmlFilterImageEntries
// we find all links that are part of a reference inside a url
final HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
final Iterator<?> i = links.iterator();
Object o;
yacyURL url;
String u;
int pos;
loop: while (i.hasNext()) try {
o = i.next();
if (o instanceof yacyURL) url = (yacyURL) o;
else if (o instanceof String) url = new yacyURL((String) o, null);
else if (o instanceof ImageEntry) url = ((ImageEntry) o).url();
else {
assert false;
continue;
}
u = url.toNormalform(true, true);
if ((pos = u.toLowerCase().indexOf("http://",7)) > 0) {
i.remove();
u = u.substring(pos);
while ((pos = u.toLowerCase().indexOf("http://",7)) > 0) u = u.substring(pos);
url = new yacyURL(u, null);
if (!(v.containsKey(url))) v.put(url, "ref");
continue loop;
}
if ((pos = u.toLowerCase().indexOf("/www.",7)) > 0) {
i.remove();
u = "http:/" + u.substring(pos);
while ((pos = u.toLowerCase().indexOf("/www.",7)) > 0) u = "http:/" + u.substring(pos);
url = new yacyURL(u, null);
if (!(v.containsKey(url))) v.put(url, "ref");
continue loop;
}
} catch (final MalformedURLException e) {}
return v;
}
static Map<yacyURL, String> allSubpaths(final Collection<?> links) {
// links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries
final HashSet<String> h = new HashSet<String>();
Iterator<?> i = links.iterator();
Object o;
yacyURL url;
String u;
int pos;
int l;
while (i.hasNext()) try {
o = i.next();
if (o instanceof yacyURL) url = (yacyURL) o;
else if (o instanceof String) url = new yacyURL((String) o, null);
else if (o instanceof ImageEntry) url = ((ImageEntry) o).url();
else {
assert false;
continue;
}
u = url.toNormalform(true, true);
if (u.endsWith("/")) u = u.substring(0, u.length() - 1);
pos = u.lastIndexOf('/');
while (pos > 8) {
l = u.length();
u = u.substring(0, pos + 1);
h.add(u);
u = u.substring(0, pos);
assert (u.length() < l) : "u = " + u;
pos = u.lastIndexOf('/');
}
} catch (final MalformedURLException e) {}
// now convert the strings to yacyURLs
i = h.iterator();
final HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
while (i.hasNext()) {
u = (String) i.next();
try {
url = new yacyURL(u, null);
v.put(url, "sub");
} catch (final MalformedURLException e) {}
}
return v;
}
public static boolean supportedContent(final yacyURL url, final String mimeType) {
if (url == null) throw new NullPointerException();
if (parserConfig.supportedContent(url, mimeType)) return true;
return false;
}
public static void addParseableMimeTypes(final String configStr) {
parserConfig.addParseableMimeTypes(configStr);
}
public static String[] setEnabledParserList(final Set<String> mimeTypeSet) {
return parserConfig.setEnabledParserList(mimeTypeSet);
}
/**
 * Static convenience entry point: asks parserConfig whether the given file
 * extension is among its supported file extensions.
 *
 * @param fileExt the file extension to look up, passed on unchanged
 * @return the answer of <code>parserConfig.supportedFileExtContains(fileExt)</code>
 */
public static boolean supportedFileExtContains(final String fileExt) {
    final boolean extensionKnown = parserConfig.supportedFileExtContains(fileExt);
    return extensionKnown;
}
/**
 * Static convenience entry point: asks parserConfig whether the given mime type
 * is among its supported mime types.
 *
 * @param mimeType the mime type to look up, passed on unchanged
 * @return the answer of <code>parserConfig.supportedMimeTypesContains(mimeType)</code>
 */
public static boolean supportedMimeTypesContains(final String mimeType) {
    final boolean mimeTypeKnown = parserConfig.supportedMimeTypesContains(mimeType);
    return mimeTypeKnown;
}
}

@ -35,14 +35,14 @@ import java.util.Hashtable;
import org.apache.tools.bzip2.CBZip2InputStream;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.yacy.yacyURL;
public class bzipParser extends AbstractParser implements Parser {
public class bzipParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class
@ -107,7 +107,7 @@ public class bzipParser extends AbstractParser implements Parser {
checkInterruption();
// creating a new parser class to parse the unzipped content
return ParserDispatcher.parseSource(location,null,null,tempFile);
return Parser.parseSource(location,null,null,tempFile);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;

@ -34,12 +34,12 @@ import org.textmining.extraction.TextExtractor;
import org.textmining.extraction.word.WordTextExtractorFactory;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.yacy.yacyURL;
public class docParser extends AbstractParser implements Parser {
public class docParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class

@ -34,14 +34,14 @@ import java.util.Hashtable;
import java.util.zip.GZIPInputStream;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.yacy.yacyURL;
public class gzipParser extends AbstractParser implements Parser {
public class gzipParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class
@ -97,7 +97,7 @@ public class gzipParser extends AbstractParser implements Parser {
checkInterruption();
// creating a new parser class to parse the unzipped content
return ParserDispatcher.parseSource(location,null,null,tempFile);
return Parser.parseSource(location,null,null,tempFile);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;

@ -1,3 +1,29 @@
// htmlParser.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 09.07.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision: 5736 $
// $LastChangedBy: borg-0300 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
import java.io.IOException;
@ -9,7 +35,7 @@ import java.util.Hashtable;
import de.anomic.document.AbstractParser;
import de.anomic.document.Document;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.ScraperInputStream;
@ -17,7 +43,7 @@ import de.anomic.document.parser.html.TransformerWriter;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.yacy.yacyURL;
public class htmlParser extends AbstractParser implements Parser {
public class htmlParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class
@ -187,6 +213,7 @@ public class htmlParser extends AbstractParser implements Parser {
return encoding;
}
public Hashtable<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;

@ -41,14 +41,14 @@ import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.yacy.yacyURL;
public class mimeTypeParser extends AbstractParser implements Parser {
public class mimeTypeParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class
@ -140,7 +140,7 @@ public class mimeTypeParser extends AbstractParser implements Parser {
checkInterruption();
// parsing the content using the determined mimetype
return ParserDispatcher.parseSource(location,mimeType,charset,sourceFile);
return Parser.parseSource(location,mimeType,charset,sourceFile);
}
throw new ParserException("Unable to detect mimetype of resource (3).",location);
} catch (final MagicMatchNotFoundException e) {

@ -47,7 +47,7 @@ import com.catcode.odf.OpenDocumentTextInputStream;
import de.anomic.crawler.HTTPLoader;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.http.httpClient;
@ -57,7 +57,7 @@ import de.anomic.kelondro.util.FileUtils;
import de.anomic.server.serverCharBuffer;
import de.anomic.yacy.yacyURL;
public class odtParser extends AbstractParser implements Parser {
public class odtParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class
@ -109,7 +109,7 @@ public class odtParser extends AbstractParser implements Parser {
final long contentSize = zipEntry.getSize();
// creating a writer for output
if ((contentSize == -1) || (contentSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
if ((contentSize == -1) || (contentSize > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
writerFile = File.createTempFile("odtParser",".prt");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {

@ -43,14 +43,14 @@ import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.pdfbox.util.PDFTextStripper;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.server.serverCharBuffer;
import de.anomic.yacy.yacyURL;
public class pdfParser extends AbstractParser implements Parser {
public class pdfParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class
@ -122,7 +122,7 @@ public class pdfParser extends AbstractParser implements Parser {
}
// creating a writer for output
if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
writerFile = File.createTempFile("pdfParser",".prt");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {

@ -34,12 +34,12 @@ import java.util.Hashtable;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.yacy.yacyURL;
public class pptParser extends AbstractParser implements Parser {
public class pptParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class

@ -37,13 +37,13 @@ import java.io.InputStreamReader;
import java.util.Hashtable;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.yacy.yacyURL;
public class psParser extends AbstractParser implements Parser {
public class psParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class

@ -38,7 +38,7 @@ import com.jguild.jrpm.io.datatype.DataTypeIf;
import de.anomic.crawler.HTTPLoader;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.http.httpClient;
@ -51,7 +51,7 @@ import de.anomic.yacy.yacyURL;
* @author theli
*
*/
public class rpmParser extends AbstractParser implements Parser {
public class rpmParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class

@ -39,7 +39,7 @@ import java.util.Map;
import de.anomic.content.RSSMessage;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.document.parser.html.AbstractScraper;
@ -53,7 +53,7 @@ import de.anomic.kelondro.util.FileUtils;
import de.anomic.server.serverCharBuffer;
import de.anomic.yacy.yacyURL;
public class rssParser extends AbstractParser implements Parser {
public class rssParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class

@ -34,12 +34,12 @@ import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.yacy.yacyURL;
public class rtfParser extends AbstractParser implements Parser {
public class rtfParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class

@ -41,8 +41,9 @@ import SevenZip.Archive.IInArchive;
import SevenZip.Archive.SevenZipEntry;
import SevenZip.Archive.SevenZip.Handler;
import de.anomic.document.AbstractParser;
import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.kelondro.util.FileUtils;
@ -50,7 +51,7 @@ import de.anomic.server.serverCachedFileOutputStream;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
public class sevenzipParser extends AbstractParser implements Parser {
public class sevenzipParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class
@ -99,14 +100,14 @@ public class sevenzipParser extends AbstractParser implements Parser {
@Override
public Document parse(final yacyURL location, final String mimeType, final String charset,
final byte[] source) throws ParserException, InterruptedException {
return parse(location, mimeType, charset, new ByteArrayIInStream(source), Parser.MAX_KEEP_IN_MEMORY_SIZE - source.length);
return parse(location, mimeType, charset, new ByteArrayIInStream(source), Idiom.MAX_KEEP_IN_MEMORY_SIZE - source.length);
}
@Override
public Document parse(final yacyURL location, final String mimeType, final String charset,
final File sourceFile) throws ParserException, InterruptedException {
try {
return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Parser.MAX_KEEP_IN_MEMORY_SIZE);
return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Idiom.MAX_KEEP_IN_MEMORY_SIZE);
} catch (final IOException e) {
throw new ParserException("error processing 7zip archive", location, e);
}
@ -115,7 +116,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
public Document parse(final yacyURL location, final String mimeType, final String charset,
final InputStream source) throws ParserException, InterruptedException {
try {
final serverCachedFileOutputStream cfos = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
final serverCachedFileOutputStream cfos = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
FileUtils.copy(source, cfos);
if (cfos.isFallback()) {
return parse(location, mimeType, charset, cfos.getContentFile());
@ -189,11 +190,11 @@ public class sevenzipParser extends AbstractParser implements Parser {
// workaround for relative links in file, normally '#' shall be used behind the location, see
// below for reversion of the effects
final yacyURL url = yacyURL.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = ParserDispatcher.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
final String mime = Classification.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
if (this.cfos.isFallback()) {
theDoc = ParserDispatcher.parseSource(url, mime, null, this.cfos.getContentFile());
theDoc = Parser.parseSource(url, mime, null, this.cfos.getContentFile());
} else {
theDoc = ParserDispatcher.parseSource(url, mime, null, this.cfos.getContentBAOS());
theDoc = Parser.parseSource(url, mime, null, this.cfos.getContentBAOS());
}
this.doc.addSubDocument(theDoc);

@ -33,12 +33,12 @@ import java.util.Hashtable;
import pt.tumba.parser.swf.SWF2HTML;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.yacy.yacyURL;
public class swfParser extends AbstractParser implements Parser {
public class swfParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class

@ -43,8 +43,9 @@ import com.ice.tar.TarEntry;
import com.ice.tar.TarInputStream;
import de.anomic.document.AbstractParser;
import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.document.parser.html.ContentScraper;
@ -53,7 +54,7 @@ import de.anomic.kelondro.util.ByteBuffer;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.yacy.yacyURL;
public class tarParser extends AbstractParser implements Parser {
public class tarParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class
@ -85,7 +86,7 @@ public class tarParser extends AbstractParser implements Parser {
File outputFile = null;
Document subDoc = null;
try {
if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".prt");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
@ -96,7 +97,7 @@ public class tarParser extends AbstractParser implements Parser {
* If the mimeType was not reported correctly by the webserver we
* have to decompress it first
*/
final String ext = ParserDispatcher.getFileExt(location).toLowerCase();
final String ext = Classification.getFileExt(location).toLowerCase();
if (ext.equals("gz") || ext.equals("tgz")) {
source = new GZIPInputStream(source);
}
@ -129,7 +130,7 @@ public class tarParser extends AbstractParser implements Parser {
final String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
// trying to determine the mimeType per file extension
final String entryMime = ParserDispatcher.getMimeTypeByFileExt(entryExt);
final String entryMime = Classification.getMimeTypeByFileExt(entryExt);
// getting the entry content
File subDocTempFile = null;
@ -144,7 +145,7 @@ public class tarParser extends AbstractParser implements Parser {
checkInterruption();
// parsing the content
subDoc = ParserDispatcher.parseSource(yacyURL.newURL(location,"#" + entryName),entryMime,null,subDocTempFile);
subDoc = Parser.parseSource(yacyURL.newURL(location,"#" + entryName),entryMime,null,subDocTempFile);
} catch (final ParserException e) {
this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getMessage());
} finally {

@ -39,7 +39,7 @@ import java.util.LinkedList;
import de.anomic.crawler.HTTPLoader;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.http.httpClient;
@ -53,7 +53,7 @@ import de.anomic.yacy.yacyURL;
* @author theli
*
*/
public class vcfParser extends AbstractParser implements Parser {
public class vcfParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class

@ -31,14 +31,14 @@ import java.io.InputStream;
import java.util.Hashtable;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.yacy.yacyURL;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpsf.SummaryInformation;
public class vsdParser extends AbstractParser implements Parser {
public class vsdParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class

@ -40,12 +40,12 @@ import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import de.anomic.document.AbstractParser;
import de.anomic.document.Parser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.yacy.yacyURL;
public class xlsParser extends AbstractParser implements Parser, HSSFListener {
public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
//StringBuilder for parsed text
private StringBuilder sbFoundStrings = null;

@ -41,8 +41,9 @@ import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import de.anomic.document.AbstractParser;
import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.document.parser.html.ContentScraper;
@ -51,7 +52,7 @@ import de.anomic.kelondro.util.ByteBuffer;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.yacy.yacyURL;
public class zipParser extends AbstractParser implements Parser {
public class zipParser extends AbstractParser implements Idiom {
/**
* a list of mime types that are supported by this parser class
@ -85,7 +86,7 @@ public class zipParser extends AbstractParser implements Parser {
File outputFile = null;
Document subDoc = null;
try {
if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".prt");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
@ -117,7 +118,7 @@ public class zipParser extends AbstractParser implements Parser {
final String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
// trying to determine the mimeType per file extension
final String entryMime = ParserDispatcher.getMimeTypeByFileExt(entryExt);
final String entryMime = Classification.getMimeTypeByFileExt(entryExt);
// parsing the content
File subDocTempFile = null;
@ -129,7 +130,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zippedContent,subDocTempFile,entry.getSize());
// parsing the zip file entry
subDoc = ParserDispatcher.parseSource(yacyURL.newURL(location,"#" + entryName),entryMime,null, subDocTempFile);
subDoc = Parser.parseSource(yacyURL.newURL(location,"#" + entryName),entryMime,null, subDocTempFile);
} catch (final ParserException e) {
this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage());
} finally {

@ -80,7 +80,7 @@ import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import java.util.zip.GZIPOutputStream;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.Classification;
import de.anomic.document.parser.htmlParser;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.ScraperInputStream;
@ -231,7 +231,7 @@ public final class httpdFileHandler {
}
headers.put(httpHeader.SERVER, "AnomicHTTPD (www.anomic.de)");
headers.put(httpHeader.DATE, DateFormatter.formatRFC1123(new Date()));
if(!(ParserDispatcher.mediaExtContains(ext))){
if(!(Classification.mediaExtContains(ext))){
headers.put(httpHeader.PRAGMA, "no-cache");
}
return headers;

@ -73,7 +73,7 @@ import java.util.zip.GZIPOutputStream;
import de.anomic.crawler.HTTPLoader;
import de.anomic.data.Blacklist;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.Classification;
import de.anomic.document.parser.html.ContentTransformer;
import de.anomic.document.parser.html.Transformer;
import de.anomic.kelondro.util.DateFormatter;
@ -528,7 +528,7 @@ public final class httpdProxyHandler {
final String storeError = cacheEntry.shallStoreCacheForProxy();
final boolean storeHTCache = cacheEntry.profile().storeHTCache();
final boolean isSupportedContent = ParserDispatcher.supportedContent(cacheEntry.url(), cacheEntry.getMimeType());
final boolean isSupportedContent = Classification.supportedContent(cacheEntry.url(), cacheEntry.getMimeType());
if (
/*
* Now we store the response into the htcache directory if

@ -26,7 +26,6 @@
package de.anomic.kelondro.order;
import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;
@ -238,7 +237,7 @@ public final class NaturalOrder extends AbstractOrder<byte[]> implements ByteOrd
return sb.toString();
}
public static Iterator<Long> LongIterator(Iterator<byte[]> b256Iterator) throws IOException {
public static Iterator<Long> LongIterator(Iterator<byte[]> b256Iterator) {
return new LongIter(b256Iterator);
}

@ -41,7 +41,7 @@ import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.Classification;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpDocument;
import de.anomic.kelondro.blob.ArrayStack;
@ -181,7 +181,7 @@ public final class plasmaHTCache {
}
public static boolean isText(final String mimeType) {
return ParserDispatcher.supportedMimeTypesContains(mimeType);
return Classification.supportedMimeTypesContains(mimeType);
}
public static boolean noIndexingURL(final yacyURL url) {
@ -200,7 +200,7 @@ public final class plasmaHTCache {
//php
return ParserDispatcher.mediaExtContains(urlString);
return Classification.mediaExtContains(urlString);
}

@ -144,7 +144,8 @@ import de.anomic.data.wiki.wikiBoard;
import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
import de.anomic.document.Condenser;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Word;
import de.anomic.document.Document;
@ -517,13 +518,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// define an extension-blacklist
log.logConfig("Parser: Initializing Extension Mappings for Media/Parser");
ParserDispatcher.initMediaExt(ParserDispatcher.extString2extList(getConfig(plasmaSwitchboardConstants.PARSER_MEDIA_EXT,"")));
ParserDispatcher.initSupportedHTMLFileExt(ParserDispatcher.extString2extList(getConfig(plasmaSwitchboardConstants.PARSER_MEDIA_EXT_PARSEABLE,"")));
Classification.initMediaExt(Classification.extString2extList(getConfig(plasmaSwitchboardConstants.PARSER_MEDIA_EXT,"")));
Classification.initSupportedHTMLFileExt(Classification.extString2extList(getConfig(plasmaSwitchboardConstants.PARSER_MEDIA_EXT_PARSEABLE,"")));
// define a realtime parsable mimetype list
log.logConfig("Parser: Initializing Mime Types");
ParserDispatcher.initHTMLParsableMimeTypes(getConfig(plasmaSwitchboardConstants.PARSER_MIMETYPES_HTML, "application/xhtml+xml,text/html,text/plain"));
ParserDispatcher.addParseableMimeTypes(getConfig(plasmaSwitchboardConstants.PARSER_MIMETYPES, null));
Classification.initHTMLParsableMimeTypes(getConfig(plasmaSwitchboardConstants.PARSER_MIMETYPES_HTML, "application/xhtml+xml,text/html,text/plain"));
Classification.addParseableMimeTypes(getConfig(plasmaSwitchboardConstants.PARSER_MIMETYPES, null));
// start a loader
log.logConfig("Starting Crawl Loader");
@ -1097,7 +1098,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
*
* Testing if the content type is supported by the available parsers
* ========================================================================= */
final boolean isSupportedContent = ParserDispatcher.supportedContent(entry.url(),entry.getMimeType());
final boolean isSupportedContent = Classification.supportedContent(entry.url(),entry.getMimeType());
if (log.isFinest()) log.logFinest("STORE "+ entry.url() +" content of type "+ entry.getMimeType() +" is supported: "+ isSupportedContent);
/* =========================================================================
@ -1689,7 +1690,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
try {
// parse the document
document = ParserDispatcher.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), plasmaHTCache.getResourceContent(entry.url()));
document = Parser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), plasmaHTCache.getResourceContent(entry.url()));
assert(document != null) : "Unexpected error. Parser returned null.";
} catch (final ParserException e) {
this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());

@ -39,7 +39,8 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.anomic.document.Condenser;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Word;
import de.anomic.document.Document;
@ -871,25 +872,25 @@ public class SnippetCache {
if ( // if no extension is available
(p < 0) ||
// or the extension is supported by one of the parsers
((p >= 0) && (ParserDispatcher.supportedFileExtContains(filename.substring(p + 1))))
((p >= 0) && (Classification.supportedFileExtContains(filename.substring(p + 1))))
) {
String supposedMime = "text/html";
// if the mimeType Parser is installed we can set the mimeType to null to force
// a mimetype detection
if (ParserDispatcher.supportedMimeTypesContains("application/octet-stream")) {
if (Classification.supportedMimeTypesContains("application/octet-stream")) {
supposedMime = null;
} else if (p != -1){
// otherwise we try to determine the mimeType per file Extension
supposedMime = ParserDispatcher.getMimeTypeByFileExt(filename.substring(p + 1));
supposedMime = Classification.getMimeTypeByFileExt(filename.substring(p + 1));
}
return ParserDispatcher.parseSource(url, supposedMime, null, contentLength, resourceStream);
return Parser.parseSource(url, supposedMime, null, contentLength, resourceStream);
}
return null;
}
if (ParserDispatcher.supportedMimeTypesContains(responseHeader.mime())) {
return ParserDispatcher.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
if (Classification.supportedMimeTypesContains(responseHeader.mime())) {
return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
}
return null;
} catch (final InterruptedException e) {

@ -58,7 +58,8 @@ import java.util.concurrent.TimeoutException;
import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
import de.anomic.document.ParserDispatcher;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.kelondro.util.ByteBuffer;
@ -102,8 +103,8 @@ public class mediawikiIndex extends Thread {
this.count = 0;
this.start = 0;
// must be called before usage:
ParserDispatcher.initHTMLParsableMimeTypes("text/html");
ParserDispatcher.addParseableMimeTypes("text/html");
Classification.initHTMLParsableMimeTypes("text/html");
Classification.addParseableMimeTypes("text/html");
}
/**
@ -145,8 +146,8 @@ public class mediawikiIndex extends Thread {
StringBuilder sb = new StringBuilder();
boolean page = false, text = false;
String title = null;
ParserDispatcher.initHTMLParsableMimeTypes("text/html");
ParserDispatcher.addParseableMimeTypes("text/html");
Classification.initHTMLParsableMimeTypes("text/html");
Classification.addParseableMimeTypes("text/html");
wikiparserrecord poison = newRecord();
int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);
@ -487,7 +488,7 @@ public class mediawikiIndex extends Thread {
public void genDocument() throws InterruptedException, ParserException {
try {
url = new yacyURL(urlStub + title, null);
document = ParserDispatcher.parseSource(url, "text/html", "utf-8", html.getBytes("UTF-8"));
document = Parser.parseSource(url, "text/html", "utf-8", html.getBytes("UTF-8"));
// the wiki parser is not able to find the proper title in the source text, so it must be set here
document.setTitle(title);
} catch (UnsupportedEncodingException e) {

@ -28,6 +28,12 @@ package de.anomic.yacy.dht;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.yacy.yacySeed;
/**
* A flat word partition scheme is a metric for words on the range of a distributed
* hash table. The dht is reflected by a 0..Long.MAX_VALUE integer range, each word gets
* a number on that range. To compute a number, the hash representation is used to compute
* the hash position from the first 63 bits of the b64 hash string.
*/
public class FlatWordPartitionScheme implements PartitionScheme {
public static final FlatWordPartitionScheme std = new FlatWordPartitionScheme();

@ -147,12 +147,11 @@ public class ymageChart extends ymageMatrix {
public static void main(final String[] args) {
System.setProperty("java.awt.headless", "true");
final boolean invers = false;
final String bg = (invers) ? "000000" : "FFFFFF";
final String fg = (invers) ? "FFFFFF" : "000000";
final String scale = (invers) ? "333333" : "CCCCCC";
final String green = (invers) ? "008800" : "008800";
final String blue = (invers) ? "0000FF" : "0000FF";
final String bg = "FFFFFF";
final String fg = "000000";
final String scale = "CCCCCC";
final String green = "008800";
final String blue = "0000FF";
final ymageChart ip = new ymageChart(660, 240, bg, fg, fg, 30, 30, 20, 20, "PEER PERFORMANCE GRAPH: PAGES/MINUTE and USED MEMORY", "");
ip.declareDimension(DIMENSION_BOTTOM, 60, 60, -600, fg, scale, "TIME/SECONDS");
//ip.declareDimension(DIMENSION_TOP, 10, 40, "000000", null, "count");

Loading…
Cancel
Save