redesign of parser mime type detection and parser steering

There is now a mime-blacklist instead of a mime-whitelist

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6190 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent e15d27bc63
commit 57a88d435b

@ -247,23 +247,9 @@ releases = DATA/RELEASE
minimumLocalDelta = 0
minimumGlobalDelta = 500
# the following mime-types are the whitelist for indexing
#
# parseableMimeTypes: specifies mime-types that can be indexed with any built-in parser
parseableMimeTypes=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd
# parseableMimeTypes.IMAGE: specifies mime-types that refer to image type content
parseableMimeTypes.IMAGE=image/gif,image/jpeg,image/png,image/tiff,image/vnd.wap.wbmp,image/x-icon,image/bmp
# parseableMimeTypes.HTML: specifies mime-types that can be indexed with built-in html parser
parseableMimeTypes.HTML=application/xhtml+xml,text/html,text/plain,text/sgml
# media extension string
# a comma-separated list of extensions that denote media file formats
# this is important for recognizing <a href> tags that do not refer to html documents
# These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
mediaExt=7z,ace,aif,aiff,arj,asf,asx,avi,bin,bmp,bz2,css,db,dcm,deb,doc,dll,dmg,exe,gif,gz,hqx,ico,img,iso,jar,jpe,jpg,jpeg,lx,lxl,m4v,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,scr,sit,so,swf,sxc,sxd,sxi,sxw,tar,tbz,tgz,torrent,vsd,war,wav,wmv,xcf,xls,zip
parseableExt=html,htm,txt,php,shtml,asp,aspx,jsp
# the following mime-types are a blacklist for indexing:
# parser.mime.deny: specifies mime-types that shall not be indexed
parser.mime.deny=
# Promotion Strings
# These strings appear in the Web Mask of the YACY search client

@ -29,15 +29,13 @@
import java.net.InetSocketAddress;
import java.net.SocketException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpd;
@ -46,6 +44,7 @@ import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Digest;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -458,30 +457,16 @@ public class SettingsAck_p {
if (post.containsKey("parserSettings")) {
post.remove("parserSettings");
final HashSet<String> newConfig = new HashSet<String>();
// loop through all received settings
final Iterator<String> keyEnum = post.keySet().iterator();
while (keyEnum.hasNext()) {
String key = keyEnum.next();
if (key.startsWith("mimename")) newConfig.add(post.get(key));
if (key.startsWith("mimename")) Parser.grantMime(key.substring(9), post.get(key).equals("on"));
}
int enabledMimesCount = 0;
final StringBuilder currEnabledMimesTxt = new StringBuilder();
final String[] enabledMimes = Classification.setEnabledParserList(newConfig);
Arrays.sort(enabledMimes);
currEnabledMimesTxt.setLength(0);
for (int i=0; i < enabledMimes.length; i++) {
currEnabledMimesTxt.append(enabledMimes[i]).append(",");
prop.put("info_parser_" + enabledMimesCount + "_enabledMime", newConfig.toString());
enabledMimesCount++;
}
if (currEnabledMimesTxt.length() > 0) currEnabledMimesTxt.deleteCharAt(currEnabledMimesTxt.length()-1);
env.setConfig("parseableMimeTypes", currEnabledMimesTxt.toString());
env.setConfig(plasmaSwitchboardConstants.PARSER_MIME_DENY, Parser.getDenyMime());
prop.put("info_parser",enabledMimesCount);
prop.put("info_parser", 0);
prop.put("info", "18");
return prop;

@ -24,11 +24,9 @@
// javac -classpath .:../Classes Settings_p.java
// if the shell's current path is HTROOT
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.http.httpHeader;
@ -219,17 +217,15 @@ public final class Settings_p {
*/
int parserIdx = 0;
final Iterator<Idiom> availableParserIter = Parser.availableParserList.values().iterator();
final Iterator<Idiom> availableParserIter = Parser.idioms().iterator();
while (availableParserIter.hasNext()) {
final Idiom parserInfo = availableParserIter.next();
prop.put("parser_" + parserIdx + "_name", parserInfo.getName());
int mimeIdx = 0;
final Enumeration<String> mimeTypeIter = parserInfo.getSupportedMimeTypes().keys();
while (mimeTypeIter.hasMoreElements()) {
final String mimeType = mimeTypeIter.nextElement();
for (String mimeType: parserInfo.getSupportedMimeTypes().keySet()) {
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_mimetype", mimeType);
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (Classification.supportedMimeTypesContains(mimeType)) ? 1 : 0);
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (Parser.supportsMime(mimeType)) ? 1 : 0);
mimeIdx++;
}
prop.put("parser_" + parserIdx + "_mime", mimeIdx);

@ -32,7 +32,7 @@ import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;
import de.anomic.http.httpResponseHeader;
@ -218,15 +218,24 @@ public class FTPLoader {
private httpDocument getFile(final ftpc ftpClient, final CrawlEntry entry) throws Exception {
// determine the mimetype of the resource
final yacyURL entryUrl = entry.url();
final String extension = Classification.getFileExt(entryUrl);
final String mimeType = Classification.getMimeTypeByFileExt(extension);
final String mimeType = Parser.mimeOf(entryUrl);
final String path = getPath(entryUrl);
// if the mimetype and file extension is supported we start to download
// the file
httpDocument htCache = null;
if (Classification.supportedContent(entryUrl, mimeType)) {
// aborting download if content is too long
if (!Parser.supportsExtension(entryUrl)) {
// if the response has not the right file type then reject file
log.logInfo("REJECTED WRONG EXTENSION TYPE " + mimeType + " for URL " + entry.url().toString());
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong extension");
throw new Exception("response has not the right extension type -> rejected");
} else if (!Parser.supportsMime(mimeType)) {
// if the response has not the right file type then reject file
log.logInfo("REJECTED WRONG MIME TYPE " + mimeType + " for URL " + entry.url().toString());
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong mime type");
throw new Exception("response has not the right mime type -> rejected");
} else {
// abort the download if content is too long
final int size = ftpClient.fileSize(path);
if (size <= maxFileSize || maxFileSize == -1) {
// timeout for download
@ -246,11 +255,6 @@ public class FTPLoader {
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new Exception("file size exceeds limit");
}
} else {
// if the response has not the right file type then reject file
log.logInfo("REJECTED WRONG MIME/EXT TYPE " + mimeType + " for URL " + entry.url().toString());
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
throw new Exception("response has not the right file type -> rejected");
}
return htCache;
}

@ -29,7 +29,7 @@ import java.io.IOException;
import java.util.Date;
import de.anomic.data.Blacklist;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;
import de.anomic.http.httpResponse;
@ -156,8 +156,15 @@ public final class HTTPLoader {
// request has been placed and result has been returned. work off response
//try {
if (Classification.supportedContent(entry.url(), res.getResponseHeader().mime())) {
if (!Parser.supportsExtension(entry.url())) {
// if the response has not the right file type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong extension");
throw new IOException("REJECTED WRONG EXTENSION TYPE " + entry.url().getFileExtension()+ " for URL " + entry.url().toString());
} else if (!Parser.supportsMime(res.getResponseHeader().mime())) {
// if the response has not the right file type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type");
throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
} else {
// get the content length and check if the length is allowed
long contentLength = res.getResponseHeader().getContentLength();
if (maxFileSize >= 0 && contentLength > maxFileSize) {
@ -177,10 +184,6 @@ public final class HTTPLoader {
}
htCache.setCacheArray(responseBody);
} else {
// if the response has not the right file type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
throw new IOException("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
}
return htCache;
/*

@ -54,7 +54,7 @@ public abstract class AbstractParser implements Idiom {
/**
* Parser name
*/
protected String parserName = this.getClass().getName();
private String parserName;
/**
* The source file file size in bytes if the source document was passed
@ -65,7 +65,7 @@ public abstract class AbstractParser implements Idiom {
/**
* The Constructor of this class.
*/
public AbstractParser() {
public AbstractParser(String name) {
    super();
    // store the supplied name; without this assignment the private
    // parserName field is never initialised and stays null
    this.parserName = name;
}
@ -125,10 +125,7 @@ public abstract class AbstractParser implements Idiom {
// XXX: workaround for relative paths within document
+ file.getPath().substring(file.getPath().indexOf(File.separatorChar) + 1)
+ "/" + file.getName());
final Document subdoc = Parser.parseSource(
url,
Classification.getMimeTypeByFileExt(files[i].substring(files[i].indexOf('.') + 1)),
null, file);
final Document subdoc = Parser.parseSource(url, Parser.mimeOf(url), null, file);
// TODO: change anchors back to use '#' after archive name
doc.addSubDocument(subdoc);
subdoc.close();

@ -2,8 +2,6 @@
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 09.07.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
// $LastChangedRevision: 5736 $
// $LastChangedBy: borg-0300 $
@ -30,38 +28,25 @@ import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
public class Classification {
public static final HashSet<String> supportedHTMLFileExt = new HashSet<String>();
public static final HashSet<String> supportedHTMLMimeTypes = new HashSet<String>();
private static final HashSet<String> mediaExtSet = new HashSet<String>();
private static final HashSet<String> imageExtSet = new HashSet<String>();
private static final HashSet<String> audioExtSet = new HashSet<String>();
private static final HashSet<String> videoExtSet = new HashSet<String>();
private static final HashSet<String> appsExtSet = new HashSet<String>();
private static final Properties mimeTypeLookupByFileExt = new Properties();
public final static HashSet<String> enabledParserList = new HashSet<String>();
private final static HashSet<String> supportedFileExt = new HashSet<String>();
private static final Properties ext2mime = new Properties();
static {
// load a list of extensions from file
BufferedInputStream bufferedIn = null;
try {
mimeTypeLookupByFileExt.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime"))));
ext2mime.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime"))));
} catch (final IOException e) {
System.err.println("ERROR: httpd.mime not found in settings path");
} finally {
@ -70,219 +55,46 @@ public class Classification {
} catch (final Exception e) {}
}
final String apps = "sit,hqx,img,dmg,exe,com,bat,sh,vbs,zip,jar";
final String audio = "mp2,mp3,ogg,aac,aif,aiff,wav";
final String video = "swf,avi,wmv,rm,mov,mpg,mpeg,ram,m4v";
final String image = "jpg,jpeg,jpe,gif,png,ico,bmp";
final String apps = "7z,ace,arc,arj,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip";
final String audio = "aac,aif,aiff,flac,m4a,m4p,mid,mp2,mp3,oga,ogg,ram,wav,wma";
final String video = "3g2,3gp,3gp2,3gpp,3gpp2,3ivx,asf,asx,avi,div,divx,dv,dvx,env,f4v,flv,hdmov,m1v,m4v,m-jpeg,moov,mov,movie,mp2v,mp4,mpe,mpeg,mpg,mpg4,mv4,ogm,ogv,qt,rm,rv,vid,swf,wmv";
final String image = "ai,bmp,cdr,cmx,emf,eps,gif,img,jpeg,jpg,mng,pct,pdd,pdn,pict,png,psb,psd,psp,tif,tiff,wmf";
imageExtSet.addAll(extString2extList(image)); // image formats
audioExtSet.addAll(extString2extList(audio)); // audio formats
videoExtSet.addAll(extString2extList(video)); // video formats
appsExtSet.addAll(extString2extList(apps)); // application formats
initMediaExt(extString2extList(apps + "," + // application container
"tar,gz,bz2,arj,zip,rar," + // archive formats
"ps,xls,ppt,asf," + // text formats without support
audio + "," + // audio formats
video + "," + // video formats
image // image formats
));
addSet(imageExtSet, image); // image formats
addSet(audioExtSet, audio); // audio formats
addSet(videoExtSet, video); // video formats
addSet(appsExtSet, apps); // application formats
addSet(mediaExtSet, apps + "," + audio + "," + video + "," + image); // all media formats
}
public static List<String> extString2extList(final String extString) {
final LinkedList<String> extensions = new LinkedList<String>();
if ((extString == null) || (extString.length() == 0)) {
return extensions;
}
final String[] xs = extString.split(",");
for (int i = 0; i < xs.length; i++)
extensions.add(xs[i].toLowerCase().trim());
return extensions;
private static void addSet(Set<String> set, final String extString) {
if ((extString == null) || (extString.length() == 0)) return;
for (String s: extString.split(",")) set.add(s.toLowerCase().trim());
}
/**
 * Adds every entry of the given list to the set of media file extensions.
 *
 * @param mediaExtList the extensions to add
 */
public static void initMediaExt(final List<String> mediaExtList) {
    for (final String extension : mediaExtList) {
        mediaExtSet.add(extension);
    }
}
public static boolean mediaExtContains(String mediaExt) {
public static boolean isMediaExtension(String mediaExt) {
if (mediaExt == null) return false;
mediaExt = mediaExt.trim().toLowerCase();
if (supportedHTMLFileExt.contains(mediaExt)) return false;
if (supportedFileExtContains(mediaExt)) return false;
return mediaExtSet.contains(mediaExt);
return mediaExtSet.contains(mediaExt.trim().toLowerCase());
}
public static boolean imageExtContains(final String imageExt) {
public static boolean isImageExtension(final String imageExt) {
if (imageExt == null) return false;
return imageExtSet.contains(imageExt.trim().toLowerCase());
}
public static boolean audioExtContains(final String audioExt) {
public static boolean isAudioExtension(final String audioExt) {
if (audioExt == null) return false;
return audioExtSet.contains(audioExt.trim().toLowerCase());
}
public static boolean videoExtContains(final String videoExt) {
public static boolean isVideoExtension(final String videoExt) {
if (videoExt == null) return false;
return videoExtSet.contains(videoExt.trim().toLowerCase());
}
public static boolean appsExtContains(final String appsExt) {
public static boolean isApplicationExtension(final String appsExt) {
if (appsExt == null) return false;
return appsExtSet.contains(appsExt.trim().toLowerCase());
}
/**
 * Adds the mime types of a comma-separated list to the set of mime types
 * handled by the html parser. Each entry is lower-cased and trimmed.
 * A null or empty list leaves the set unchanged.
 *
 * @param htmlParsableMimeTypes comma-separated mime types, may be null
 */
public static void initHTMLParsableMimeTypes(final String htmlParsableMimeTypes) {
    if (htmlParsableMimeTypes == null || htmlParsableMimeTypes.length() == 0) {
        return;
    }
    for (final String entry : htmlParsableMimeTypes.split(",")) {
        supportedHTMLMimeTypes.add(entry.toLowerCase().trim());
    }
}
/**
 * Brings a mime type string into a canonical form: a missing type becomes
 * "application/octet-stream", the value is trimmed and lower-cased, and
 * everything from the first ';' onwards is cut off.
 *
 * @param mimeType the raw mime type, may be null
 * @return the normalized mime type
 */
public static String normalizeMimeType(String mimeType) {
    final String candidate = (mimeType == null ? "application/octet-stream" : mimeType).trim().toLowerCase();
    final int semicolon = candidate.indexOf(';');
    if (semicolon < 0) {
        return candidate;
    }
    // note: whitespace directly in front of the ';' is kept
    return candidate.substring(0, semicolon);
}
/**
 * Looks up the mime type registered for a file extension.
 *
 * @param fileExt the file extension used as lookup key
 * @return the registered mime type, or "application/octet-stream" if the
 *         extension has no entry
 */
public static String getMimeTypeByFileExt(final String fileExt) {
    final String mime = mimeTypeLookupByFileExt.getProperty(fileExt);
    return (mime == null) ? "application/octet-stream" : mime;
}
/**
 * Adds every entry of the given list to the set of html file extensions.
 *
 * @param supportedRealtimeFileExtList the extensions to add
 */
public static void initSupportedHTMLFileExt(final List<String> supportedRealtimeFileExtList) {
    for (final String extension : supportedRealtimeFileExtList) {
        supportedHTMLFileExt.add(extension);
    }
}
/**
 * Tells whether the normalized form of a mime type is contained in the
 * set of mime types handled by the html parser.
 *
 * @param mimeType the raw mime type
 * @return true if the normalized mime type is in the html mime type set
 */
static boolean HTMLParsableMimeTypesContains(String mimeType) {
    return supportedHTMLMimeTypes.contains(normalizeMimeType(mimeType));
}
/**
 * Decides whether a resource is supported, based on its mime type and,
 * for most types, also on the file extension of its url.
 * For "text/html", "application/xhtml+xml" and "text/plain" only the
 * mime type is checked; for every other type the file extension of the
 * url must be supported as well.
 *
 * @param url      the url of the resource
 * @param mimeType the raw mime type of the resource
 * @return true if the content is supported
 */
public static boolean supportedContent(final yacyURL url, String mimeType) {
    final String mime = Classification.normalizeMimeType(mimeType);
    if (!supportedMimeTypesContains(mime)) {
        return false;
    }
    final boolean extensionIrrelevant =
            mime.equals("text/html")
            || mime.equals("application/xhtml+xml")
            || mime.equals("text/plain");
    return extensionIrrelevant || supportedFileExt(url);
}
/**
 * Tells whether the normalized form of a mime type is either an html
 * mime type or one of the enabled parser mime types.
 *
 * @param mimeType the raw mime type
 * @return true if the mime type is supported
 */
public static boolean supportedMimeTypesContains(String mimeType) {
    final String mime = Classification.normalizeMimeType(mimeType);
    return Classification.supportedHTMLMimeTypes.contains(mime) || enabledParserList.contains(mime);
}
/**
 * Tells whether the file extension of a url is supported.
 *
 * @param url the url to check, must not be null
 * @return true if the extension of the url is supported
 * @throws NullPointerException if url is null
 */
private static boolean supportedFileExt(final yacyURL url) {
    if (url == null) {
        throw new NullPointerException();
    }
    return supportedFileExtContains(getFileExt(url));
}
/**
 * Tells whether a file extension is either an html file extension or one
 * of the extensions collected from the enabled parsers. The extension is
 * trimmed and lower-cased before the lookup.
 *
 * @param fileExt the file extension, may be null
 * @return true if the extension is supported, false for null
 */
public static boolean supportedFileExtContains(String fileExt) {
    if (fileExt == null) {
        return false;
    }
    final String ext = fileExt.trim().toLowerCase();
    return Classification.supportedHTMLFileExt.contains(ext) || supportedFileExt.contains(ext);
}
/**
 * Splits a comma-separated list of mime types, lower-cases and trims each
 * entry and hands the resulting set to {@link #setEnabledParserList(Set)}.
 * A null or empty list results in an empty set being passed on.
 *
 * @param enabledMimeTypes comma-separated mime types, may be null
 */
public static void addParseableMimeTypes(final String enabledMimeTypes) {
    final HashSet<String> parsed = new HashSet<String>();
    if (enabledMimeTypes != null && enabledMimeTypes.length() > 0) {
        for (final String entry : enabledMimeTypes.split(",")) {
            parsed.add(entry.toLowerCase().trim());
        }
    }
    setEnabledParserList(parsed);
}
/**
 * Enables the parsers of all mime types that are keys of
 * {@code Parser.availableParserList}.
 */
public static void enableAllParsers() {
    setEnabledParserList(Parser.availableParserList.keySet());
}
/**
 * Enables the parsers for the given mime types.
 * For each mime type that has a parser in {@code Parser.availableParserList},
 * the mime type is accepted and the comma-separated file extensions that the
 * parser declares for it are collected. Accepted mime types and collected
 * extensions are added to the sets of enabled mime types and supported file
 * extensions; entries already in those sets are kept.
 *
 * @param mimeTypeSet the mime types to enable, may be null
 * @return the mime types of the given set for which a parser was found
 */
public static String[] setEnabledParserList(final Set<String> mimeTypeSet) {
    final HashSet<String> accepted = new HashSet<String>();
    final HashSet<String> extensions = new HashSet<String>();

    if (mimeTypeSet != null) {
        for (final String mimeType : mimeTypeSet) {
            final Idiom parser = Parser.availableParserList.get(mimeType);
            if (parser == null) {
                continue; // no parser registered for this mime type
            }
            try {
                // ask the parser which extensions belong to this mime type
                final Hashtable<String, String> supported = parser.getSupportedMimeTypes();
                final String extList = (supported == null) ? null : supported.get(mimeType);
                if (extList != null && extList.length() > 0) {
                    for (final String ext : extList.split(",")) {
                        extensions.add(ext);
                    }
                }
                accepted.add(mimeType);
            } catch (final Exception e) {
                // a failing parser must not prevent the remaining ones from being enabled
                Log.logSevere("PARSER", "error in setEnabledParserList", e);
            }
        }
    }

    enabledParserList.addAll(accepted);
    supportedFileExt.addAll(extensions);
    return accepted.toArray(new String[accepted.size()]);
}
/**
 * Returns a copy of the set of enabled parser mime types; changes to the
 * returned set do not affect the internal set.
 *
 * @return a new set with the enabled mime types
 */
public static HashSet<String> getEnabledParserList() {
    return new HashSet<String>(enabledParserList);
}
/**
 * Extracts the file extension from the path of a url.
 *
 * @param url the url whose path is examined
 * @return the text after the last '.' of the last path segment, or an
 *         empty string if that segment contains no '.'
 */
public static String getFileExt(final yacyURL url) {
    // start with the path of the url
    final String path = url.getPath();

    // determine the last position of '/' and keep only what follows from there
    final int slash = path.lastIndexOf('/');
    final String fileName = (slash == -1) ? path : path.substring(slash);

    // determine the last position of '.'; everything behind it is the extension
    final int dot = fileName.lastIndexOf('.');
    return (dot < 0) ? "" : fileName.substring(dot + 1);
}
}

@ -374,14 +374,14 @@ dc_rights
} else {
ext = u.substring(extpos + 1).toLowerCase();
}
if (Classification.mediaExtContains(ext)) {
if (Classification.isMediaExtension(ext)) {
// this is not a normal anchor, its a media link
if (Classification.imageExtContains(ext)) {
if (Classification.isImageExtension(ext)) {
ContentScraper.addImage(collectedImages, new ImageEntry(url, entry.getValue(), -1, -1));
}
else if (Classification.audioExtContains(ext)) audiolinks.put(url, entry.getValue());
else if (Classification.videoExtContains(ext)) videolinks.put(url, entry.getValue());
else if (Classification.appsExtContains(ext)) applinks.put(url, entry.getValue());
else if (Classification.isAudioExtension(ext)) audiolinks.put(url, entry.getValue());
else if (Classification.isVideoExtension(ext)) videolinks.put(url, entry.getValue());
else if (Classification.isApplicationExtension(ext)) applinks.put(url, entry.getValue());
} else {
hyperlinks.put(url, entry.getValue());
}

@ -27,6 +27,7 @@ package de.anomic.document;
import java.io.File;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Hashtable;
import de.anomic.yacy.yacyURL;
@ -85,11 +86,12 @@ public interface Idiom {
throws ParserException, InterruptedException;
/**
* Can be used to determine the MimeType(s) that are supported by the parser
* @return a {@link Hashtable} containing a list of MimeTypes that are supported by
* the parser
* Get the MimeType(s) that are supported by the parser
* @return a {@link HashMap} containing a mapping from a mime type string
* to a comma-separated String of file extensions
* that are supported by the idiom parser
*/
public Hashtable<String, String> getSupportedMimeTypes();
public HashMap<String, String> getSupportedMimeTypes();
/**
* This function should be called before reusing the parser object.

@ -31,9 +31,13 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.text.Collator;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.document.parser.bzipParser;
import de.anomic.document.parser.docParser;
@ -59,13 +63,24 @@ import de.anomic.yacy.logging.Log;
public final class Parser {
private static final Log theLogger = new Log("PARSER");
public static final HashMap<String, Idiom> availableParserList = new HashMap<String, Idiom>();
private static final Log log = new Log("PARSER");
// use a collator so that lowercase and uppercase letters are not distinguished
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
static {
    // the two settings are independent of each other
    insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
    // per the java.text.Collator documentation, case differences are tertiary
    // differences and are therefore ignored at SECONDARY strength
    insensitiveCollator.setStrength(Collator.SECONDARY);
}
private static final Map<String, Idiom> mime2parser = new TreeMap<String, Idiom>(insensitiveCollator);
private static final Map<String, Set<String>> ext2mime = new TreeMap<String, Set<String>>(insensitiveCollator);
private static final Set<String> denyMime = new TreeSet<String>(insensitiveCollator);
static {
initParser(new bzipParser());
initParser(new docParser());
initParser(new gzipParser());
initParser(new htmlParser());
initParser(new mimeTypeParser());
initParser(new odtParser());
initParser(new pdfParser());
@ -82,14 +97,30 @@ public final class Parser {
initParser(new xlsParser());
initParser(new zipParser());
}
/**
 * Returns the distinct parsers that are registered for at least one mime type.
 *
 * @return a new set containing the values of the mime-to-parser table
 */
public static Set<Idiom> idioms() {
    return new HashSet<Idiom>(mime2parser.values());
}
private static void initParser(Idiom parser) {
for (Map.Entry<String, String> e: parser.getSupportedMimeTypes().entrySet()) {
// process the mime types
final String mimeType = e.getKey();
Idiom p0 = mime2parser.get(mimeType);
if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser.");
mime2parser.put(mimeType, parser);
Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());
private static void initParser(Idiom theParser) {
final Hashtable<String, String> supportedMimeTypes = theParser.getSupportedMimeTypes();
final Iterator<String> mimeTypeIterator = supportedMimeTypes.keySet().iterator();
while (mimeTypeIterator.hasNext()) {
final String mimeType = mimeTypeIterator.next();
availableParserList.put(mimeType, theParser);
Log.logInfo("PARSER", "Found parser for mimeType '" + mimeType + "': " + theParser.getName());
// process the extensions
String[] exts = e.getValue().split(",");
for (String ext: exts) {
Set<String> s = ext2mime.get(ext);
if (s == null) s = new HashSet<String>();
s.add(mimeType);
ext2mime.put(ext, s);
}
}
}
@ -99,10 +130,10 @@ public final class Parser {
ParserException {
ByteArrayInputStream byteIn = null;
try {
if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from byte-array");
if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
if (sourceArray == null || sourceArray.length == 0) {
final String errorMsg = "No resource content available (1) " + (((sourceArray == null) ? "source == null" : "source.length() == 0") + ", url = " + location.toNormalform(true, false));
theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, errorMsg);
}
byteIn = new ByteArrayInputStream(sourceArray);
@ -110,7 +141,7 @@ public final class Parser {
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e);
log.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location, location, e);
} finally {
if (byteIn != null) try {
@ -125,10 +156,10 @@ public final class Parser {
BufferedInputStream sourceStream = null;
try {
if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from file");
if (log.isFine()) log.logFine("Parsing '" + location + "' from file");
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2).";
theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "document has no content");
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
@ -136,7 +167,7 @@ public final class Parser {
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location, location, e);
} finally {
if (sourceStream != null)try {
@ -150,31 +181,34 @@ public final class Parser {
final long contentLength, final InputStream sourceStream)
throws InterruptedException, ParserException {
try {
if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from stream");
mimeType = Classification.normalizeMimeType(mimeType);
final String fileExt = Classification.getFileExt(location);
if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType);
final String fileExt = location.getFileExtension();
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
if (!Classification.supportedContent(location, mimeType)) {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (1)";
theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "wrong mime type or wrong extension");
if (!supportsMime(mimeType)) {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "wrong mime type");
}
if (!supportsExtension(location)) {
final String errorMsg = "No parser available to parse extension of url path";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "wrong extension");
}
if (theLogger.isFine()) theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
Idiom parser = availableParserList.get(Classification.normalizeMimeType(mimeType));
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
Idiom parser = mime2parser.get(normalizeMimeType(mimeType));
Document doc = null;
if (parser != null) {
parser.setContentLength(contentLength);
doc = parser.parse(location, mimeType, documentCharset, sourceStream);
} else if (Classification.HTMLParsableMimeTypesContains(mimeType)) {
doc = new htmlParser().parse(location, mimeType, documentCharset, sourceStream);
} else {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)";
theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "wrong mime type or wrong extension");
}
if (doc == null) {
final String errorMsg = "Unexpected error. Parser returned null.";
theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
}
return doc;
@ -182,9 +216,50 @@ public final class Parser {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
final String errorMsg = "Unexpected exception. " + e.getMessage();
theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
log.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
throw new ParserException(errorMsg, location, e);
}
}
/**
 * Tests whether a document of the given mime type can be parsed.
 * <p>
 * The type is normalized before it is matched against the deny list as well
 * as the parser registry, so that a value carrying parameters
 * (e.g. "text/html; charset=UTF-8") cannot slip past a deny entry for the
 * bare type. The raw value is still matched too, so that deny entries which
 * were stored un-normalized keep working.
 *
 * @param mimeType the mime type as reported for the document, may carry parameters
 * @return true if the type is not denied and a parser is registered for it
 */
public static boolean supportsMime(String mimeType) {
    final String normalized = normalizeMimeType(mimeType);
    if (denyMime.contains(mimeType) || denyMime.contains(normalized)) return false;
    return mime2parser.containsKey(normalized);
}
/**
 * Tests whether the file extension of the given url is known to the
 * extension registry.
 *
 * @param url the url whose file extension is examined
 * @return true if the url has no file extension or the extension is registered
 */
public static boolean supportsExtension(final yacyURL url) {
    final String extension = url.getFileExtension();
    // without an extension the content may be anything; that is ok as long as the mime type is ok
    return extension.length() == 0 || ext2mime.containsKey(extension);
}
/**
 * Looks up a mime type for the file extension of the given url.
 *
 * @param url the url whose file extension is used for the lookup
 * @return the result of {@link #mimeOf(String)} for that extension
 */
public static String mimeOf(yacyURL url) {
    final String extension = url.getFileExtension();
    return mimeOf(extension);
}
/**
 * Looks up a mime type for a file extension.
 *
 * @param ext the file extension used as lookup key
 * @return the first mime type delivered by the iterator of the set that is
 *         registered for the extension, or null if nothing is registered
 */
public static String mimeOf(String ext) {
    final Set<String> candidates = ext2mime.get(ext);
    return (candidates == null) ? null : candidates.iterator().next();
}
/**
 * Reduces a mime type to its canonical lookup form: parameters after the
 * first ';' are cut off, surrounding whitespace is removed and the result is
 * lower-cased, because media type names are case-insensitive.
 *
 * @param mimeType the raw mime type, may be null and may carry parameters
 * @return the bare lower-case type, or "application/octet-stream" if the
 *         given type is null
 */
private static String normalizeMimeType(String mimeType) {
    if (mimeType == null) return "application/octet-stream";
    final int pos = mimeType.indexOf(';');
    final String bare = (pos < 0) ? mimeType : mimeType.substring(0, pos);
    // a fixed locale keeps the result independent of the platform default (e.g. Turkish dotless i)
    return bare.trim().toLowerCase(java.util.Locale.ENGLISH);
}
/**
 * Replaces the content of the deny list with the entries of a
 * comma-separated list.
 * <p>
 * Each entry is trimmed and empty entries are skipped, so that an empty
 * configuration value or a stray comma does not put an empty string into the
 * deny list. A null list just clears the deny list.
 *
 * @param denyList comma-separated mime types, may be null or empty
 */
public static void setDenyMime(String denyList) {
    denyMime.clear();
    if (denyList == null) return;
    for (String s: denyList.split(",")) {
        final String entry = s.trim();
        if (entry.length() > 0) denyMime.add(entry);
    }
}
/**
 * Serializes the deny list into a comma-separated string.
 *
 * @return all deny list entries joined with ',' in iteration order, without a
 *         trailing comma; the empty string if the deny list is empty
 */
public static String getDenyMime() {
    final StringBuilder joined = new StringBuilder();
    boolean first = true;
    for (String d: denyMime) {
        // the separator goes in front of every entry except the first one; this also
        // covers the empty deny list, where cutting off a trailing comma would fail
        if (!first) joined.append(',');
        joined.append(d);
        first = false;
    }
    return joined.toString();
}
/**
 * Grants or revokes a single mime type by editing the deny list.
 *
 * @param mime  the mime type to change
 * @param grant true removes the type from the deny list, false adds it
 */
public static void grantMime(String mime, boolean grant) {
    if (!grant) {
        denyMime.add(mime);
        return;
    }
    denyMime.remove(mime);
}
}

@ -30,8 +30,7 @@ package de.anomic.document.parser;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.Hashtable;
import java.util.HashMap;
import org.apache.tools.bzip2.CBZip2InputStream;
import de.anomic.document.AbstractParser;
@ -48,7 +47,7 @@ public class bzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static final String fileExtensions = "bz2,tbz,tbz2";
static {
SUPPORTED_MIME_TYPES.put("application/x-bzip2",fileExtensions);
@ -61,11 +60,10 @@ public class bzipParser extends AbstractParser implements Idiom {
}
public bzipParser() {
super();
this.parserName = "Bzip 2 UNIX Compressed File Parser";
super("Bzip 2 UNIX Compressed File Parser");
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -28,8 +28,7 @@
package de.anomic.document.parser;
import java.io.InputStream;
import java.util.Hashtable;
import java.util.HashMap;
import org.textmining.extraction.TextExtractor;
import org.textmining.extraction.word.WordTextExtractorFactory;
@ -45,22 +44,22 @@ public class docParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/msword","doc");
SUPPORTED_MIME_TYPES.put("application/doc","doc");
SUPPORTED_MIME_TYPES.put("appl/text","doc");
SUPPORTED_MIME_TYPES.put("application/vnd.msword","doc");
SUPPORTED_MIME_TYPES.put("application/vnd.ms-word","doc");
SUPPORTED_MIME_TYPES.put("application/winword","doc");
SUPPORTED_MIME_TYPES.put("application/word","doc");
SUPPORTED_MIME_TYPES.put("application/x-msw6","doc");
SUPPORTED_MIME_TYPES.put("application/x-msword","doc");
String ext = "doc,docx";
SUPPORTED_MIME_TYPES.put("application/msword",ext);
SUPPORTED_MIME_TYPES.put("application/doc",ext);
SUPPORTED_MIME_TYPES.put("appl/text",ext);
SUPPORTED_MIME_TYPES.put("application/vnd.msword",ext);
SUPPORTED_MIME_TYPES.put("application/vnd.ms-word",ext);
SUPPORTED_MIME_TYPES.put("application/winword",ext);
SUPPORTED_MIME_TYPES.put("application/word",ext);
SUPPORTED_MIME_TYPES.put("application/x-msw6",ext);
SUPPORTED_MIME_TYPES.put("application/x-msword",ext);
}
public docParser() {
super();
this.parserName = "Word Document Parser";
super("Word Document Parser");
}
public Document parse(final yacyURL location, final String mimeType, final String charset,
@ -103,7 +102,7 @@ public class docParser extends AbstractParser implements Idiom {
}
}
public java.util.Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return docParser.SUPPORTED_MIME_TYPES;
}

@ -30,7 +30,7 @@ package de.anomic.document.parser;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.Hashtable;
import java.util.HashMap;
import java.util.zip.GZIPInputStream;
import de.anomic.document.AbstractParser;
@ -47,27 +47,26 @@ public class gzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
static final String fileExtensions = "gz,tgz";
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static final String ext = "gz,tgz";
static {
SUPPORTED_MIME_TYPES.put("application/x-gzip",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/gzip",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-gunzip",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/gzipped",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/gzip-compressed",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-compressed",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-compress",fileExtensions);
SUPPORTED_MIME_TYPES.put("gzip/document",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/octet-stream",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-tar",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-gzip",ext);
SUPPORTED_MIME_TYPES.put("application/gzip",ext);
SUPPORTED_MIME_TYPES.put("application/x-gunzip",ext);
SUPPORTED_MIME_TYPES.put("application/gzipped",ext);
SUPPORTED_MIME_TYPES.put("application/gzip-compressed",ext);
SUPPORTED_MIME_TYPES.put("application/x-compressed",ext);
SUPPORTED_MIME_TYPES.put("application/x-compress",ext);
SUPPORTED_MIME_TYPES.put("gzip/document",ext);
SUPPORTED_MIME_TYPES.put("application/octet-stream",ext);
SUPPORTED_MIME_TYPES.put("application/x-tar",ext);
}
public gzipParser() {
super();
this.parserName = "GNU Zip Compressed Archive Parser";
super("GNU Zip Compressed Archive Parser");
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -31,8 +31,7 @@ import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Hashtable;
import java.util.HashMap;
import de.anomic.document.AbstractParser;
import de.anomic.document.Document;
import de.anomic.document.Idiom;
@ -49,17 +48,17 @@ public class htmlParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/xhtml+xml","htm,html,xhtml,php,asp");
SUPPORTED_MIME_TYPES.put("text/html","htm,html,xhtml,php,asp");
SUPPORTED_MIME_TYPES.put("text/plain","htm,html,xhtml,php,asp,txt");
SUPPORTED_MIME_TYPES.put("text/sgml","htm,html,xhtml,php,asp,xml");
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp";
SUPPORTED_MIME_TYPES.put("application/xhtml+xml", ext);
SUPPORTED_MIME_TYPES.put("text/html", ext);
SUPPORTED_MIME_TYPES.put("text/plain", ext);
SUPPORTED_MIME_TYPES.put("text/sgml",ext);
}
public htmlParser() {
super();
this.parserName = "streaming html parser";
super("streaming html parser");
}
@Override
@ -215,7 +214,7 @@ public class htmlParser extends AbstractParser implements Idiom {
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -31,6 +31,7 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.Hashtable;
import net.sf.jmimemagic.Magic;
@ -54,14 +55,14 @@ public class mimeTypeParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("text/xml","xml");
SUPPORTED_MIME_TYPES.put("application/xml","xml");
SUPPORTED_MIME_TYPES.put("application/x-xml","xml");
SUPPORTED_MIME_TYPES.put("application/octet-stream","");
SUPPORTED_MIME_TYPES.put("application/x-compress","");
SUPPORTED_MIME_TYPES.put("application/x-compressed","");
SUPPORTED_MIME_TYPES.put("application/x-xml","xml");
SUPPORTED_MIME_TYPES.put("application/octet-stream","xml");
SUPPORTED_MIME_TYPES.put("application/x-compress","xml");
SUPPORTED_MIME_TYPES.put("application/x-compressed","xml");
}
/**
@ -71,8 +72,7 @@ public class mimeTypeParser extends AbstractParser implements Idiom {
private static Hashtable<Thread, Integer> threadLoopDetection = new Hashtable<Thread, Integer>();
public mimeTypeParser() {
super();
this.parserName = "MimeType Parser";
super("MimeType Parser");
}
@SuppressWarnings("unchecked")
@ -174,7 +174,7 @@ public class mimeTypeParser extends AbstractParser implements Idiom {
}
public java.util.Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return mimeTypeParser.SUPPORTED_MIME_TYPES;
}

@ -35,8 +35,8 @@ import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
@ -63,18 +63,17 @@ public class odtParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt");
SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt");
}
public odtParser() {
super();
this.parserName = "OASIS OpenDocument V2 Text Document Parser";
super("OASIS OpenDocument V2 Text Document Parser");
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -33,8 +33,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Hashtable;
import java.util.HashMap;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
@ -56,7 +55,7 @@ public class pdfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/pdf","pdf");
SUPPORTED_MIME_TYPES.put("application/x-pdf","pdf");
@ -67,11 +66,10 @@ public class pdfParser extends AbstractParser implements Idiom {
}
public pdfParser() {
super();
this.parserName = "Acrobat Portable Document Parser";
super("Acrobat Portable Document Parser");
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -29,8 +29,7 @@ package de.anomic.document.parser;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.Hashtable;
import java.util.HashMap;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import de.anomic.document.AbstractParser;
@ -45,22 +44,21 @@ public class pptParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
static final String fileExtensions = "ppt,pps";
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static final String ext = "ppt,pps";
static {
SUPPORTED_MIME_TYPES.put("application/mspowerpoint",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/powerpoint",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/vnd.ms-powerpoint",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/ms-powerpoint",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/mspowerpnt",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/vnd-mspowerpoint",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-powerpoint",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-m",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/mspowerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/powerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/vnd.ms-powerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/ms-powerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/mspowerpnt",ext);
SUPPORTED_MIME_TYPES.put("application/vnd-mspowerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/x-powerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/x-m",ext);
}
public pptParser(){
super();
this.parserName = "Microsoft Powerpoint Parser";
super("Microsoft Powerpoint Parser");
}
/*
@ -116,7 +114,7 @@ public class pptParser extends AbstractParser implements Idiom {
}
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -34,8 +34,7 @@ import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Hashtable;
import java.util.HashMap;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
@ -49,7 +48,7 @@ public class psParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/ps","ps");
SUPPORTED_MIME_TYPES.put("application/x-postscript","ps");
@ -62,8 +61,7 @@ public class psParser extends AbstractParser implements Idiom {
private static String parserMode = "java";
public psParser() {
super();
this.parserName = "PostScript Document Parser";
super("PostScript Document Parser");
if (!modeScanDone) synchronized (modeScan) {
if (testForPs2Ascii()) parserMode = "ps2ascii";
else parserMode = "java";
@ -71,7 +69,7 @@ public class psParser extends AbstractParser implements Idiom {
}
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -31,8 +31,6 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Hashtable;
import com.jguild.jrpm.io.RPMFile;
import com.jguild.jrpm.io.datatype.DataTypeIf;
@ -57,7 +55,7 @@ public class rpmParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/x-rpm","rpm");
SUPPORTED_MIME_TYPES.put("application/x-redhat packet manager","rpm");
@ -65,11 +63,10 @@ public class rpmParser extends AbstractParser implements Idiom {
}
public rpmParser() {
super();
this.parserName = "rpm Parser";
super("rpm Parser");
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -33,7 +33,6 @@ import java.io.InputStream;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
@ -59,7 +58,7 @@ public class rssParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static final String fileExtensions = "xml,rss,rdf";
static {
SUPPORTED_MIME_TYPES.put("text/rss",fileExtensions);
@ -69,8 +68,7 @@ public class rssParser extends AbstractParser implements Idiom {
}
public rssParser() {
super();
this.parserName = "Rich Site Summary/Atom Feed Parser";
super("Rich Site Summary/Atom Feed Parser");
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
@ -176,7 +174,7 @@ public class rssParser extends AbstractParser implements Idiom {
}
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -28,8 +28,7 @@
package de.anomic.document.parser;
import java.io.InputStream;
import java.util.Hashtable;
import java.util.HashMap;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
@ -45,7 +44,7 @@ public class rtfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/rtf","rtf");
SUPPORTED_MIME_TYPES.put("text/rtf","rtf");
@ -57,8 +56,7 @@ public class rtfParser extends AbstractParser implements Idiom {
}
public rtfParser() {
super();
this.parserName = "Rich Text Format Parser";
super("Rich Text Format Parser");
}
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
@ -100,7 +98,7 @@ public class rtfParser extends AbstractParser implements Idiom {
}
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return rtfParser.SUPPORTED_MIME_TYPES;
}

@ -32,8 +32,7 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Hashtable;
import java.util.HashMap;
import SevenZip.ArchiveExtractCallback;
import SevenZip.IInStream;
import SevenZip.MyRandomAccessFile;
@ -41,7 +40,6 @@ import SevenZip.Archive.IInArchive;
import SevenZip.Archive.SevenZipEntry;
import SevenZip.Archive.SevenZip.Handler;
import de.anomic.document.AbstractParser;
import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
@ -57,14 +55,13 @@ public class sevenzipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z");
}
public sevenzipParser() {
super();
super.parserName = "7zip Archive Parser";
super("7zip Archive Parser");
}
public Document parse(final yacyURL location, final String mimeType, final String charset,
@ -127,7 +124,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
}
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
@ -190,7 +187,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
// workaround for relative links in file, normally '#' shall be used behind the location, see
// below for reversion of the effects
final yacyURL url = yacyURL.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = Classification.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
final String mime = Parser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
if (this.cfos.isFallback()) {
theDoc = Parser.parseSource(url, mime, null, this.cfos.getContentFile());
} else {

@ -29,8 +29,6 @@ package de.anomic.document.parser;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Hashtable;
import pt.tumba.parser.swf.SWF2HTML;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
@ -44,7 +42,7 @@ public class swfParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash","swf");
SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash2-preview","swf");
@ -53,14 +51,13 @@ public class swfParser extends AbstractParser implements Idiom {
}
public swfParser() {
super();
this.parserName = "Adobe Flash Parser";
super("Adobe Flash Parser");
}
/**
* returns a map containing the mime types that are supported by this class
*/
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -34,7 +34,6 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import java.util.zip.GZIPInputStream;
@ -43,7 +42,6 @@ import com.ice.tar.TarEntry;
import com.ice.tar.TarInputStream;
import de.anomic.document.AbstractParser;
import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
@ -60,7 +58,7 @@ public class tarParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/x-tar","tar");
SUPPORTED_MIME_TYPES.put("application/tar","tar");
@ -71,11 +69,10 @@ public class tarParser extends AbstractParser implements Idiom {
}
public tarParser() {
super();
this.parserName = "Tape Archive File Parser";
super("Tape Archive File Parser");
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
@ -97,7 +94,7 @@ public class tarParser extends AbstractParser implements Idiom {
* If the mimeType was not reported correctly by the web server we
* have to decompress it first
*/
final String ext = Classification.getFileExt(location).toLowerCase();
final String ext = location.getFileExtension().toLowerCase();
if (ext.equals("gz") || ext.equals("tgz")) {
source = new GZIPInputStream(source);
}
@ -130,7 +127,7 @@ public class tarParser extends AbstractParser implements Idiom {
final String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
// trying to determine the mimeType per file extension
final String entryMime = Classification.getMimeTypeByFileExt(entryExt);
final String entryMime = Parser.mimeOf(entryExt);
// getting the entry content
File subDocTempFile = null;

@ -33,7 +33,6 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
@ -61,7 +60,7 @@ public class vcfParser extends AbstractParser implements Idiom {
*
* TODO: support of x-mozilla-cpt and x-mozilla-html tags
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("text/x-vcard","vcf");
SUPPORTED_MIME_TYPES.put("application/vcard","vcf");
@ -73,11 +72,10 @@ public class vcfParser extends AbstractParser implements Idiom {
}
public vcfParser() {
super();
this.parserName = "vCard Parser";
super("vCard Parser");
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -28,8 +28,7 @@
package de.anomic.document.parser;
import java.io.InputStream;
import java.util.Hashtable;
import java.util.HashMap;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
@ -44,7 +43,7 @@ public class vsdParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/visio","vsd");
SUPPORTED_MIME_TYPES.put("application/x-visio","vsd");
@ -57,14 +56,13 @@ public class vsdParser extends AbstractParser implements Idiom {
}
public vsdParser() {
super();
this.parserName = "Microsoft Visio Parser";
super("Microsoft Visio Parser");
}
/**
* returns a map containing the mime types that are supported by this class
*/
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -28,8 +28,7 @@
package de.anomic.document.parser;
import java.io.InputStream;
import java.util.Hashtable;
import java.util.HashMap;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
@ -57,21 +56,21 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/msexcel","xls");
SUPPORTED_MIME_TYPES.put("application/excel","xls");
SUPPORTED_MIME_TYPES.put("application/vnd.ms-excel","xls");
SUPPORTED_MIME_TYPES.put("application/x-excel","xls");
SUPPORTED_MIME_TYPES.put("application/x-msexcel","xls");
SUPPORTED_MIME_TYPES.put("application/x-ms-excel","xls");
SUPPORTED_MIME_TYPES.put("application/x-dos_ms_excel","xls");
SUPPORTED_MIME_TYPES.put("application/xls","xls");
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
String ext = "xls,xlsx";
SUPPORTED_MIME_TYPES.put("application/msexcel",ext);
SUPPORTED_MIME_TYPES.put("application/excel",ext);
SUPPORTED_MIME_TYPES.put("application/vnd.ms-excel",ext);
SUPPORTED_MIME_TYPES.put("application/x-excel",ext);
SUPPORTED_MIME_TYPES.put("application/x-msexcel",ext);
SUPPORTED_MIME_TYPES.put("application/x-ms-excel",ext);
SUPPORTED_MIME_TYPES.put("application/x-dos_ms_excel",ext);
SUPPORTED_MIME_TYPES.put("application/xls",ext);
}
public xlsParser(){
super();
this.parserName = "Microsoft Excel Parser";
super("Microsoft Excel Parser");
}
/*
@ -135,7 +134,7 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener {
}
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}

@ -34,14 +34,12 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import de.anomic.document.AbstractParser;
import de.anomic.document.Classification;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
@ -58,7 +56,7 @@ public class zipParser extends AbstractParser implements Idiom {
* a list of mime types that are supported by this parser class
* @see #getSupportedMimeTypes()
*/
public static final Hashtable<String, String> SUPPORTED_MIME_TYPES = new Hashtable<String, String>();
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/zip","zip");
SUPPORTED_MIME_TYPES.put("application/x-zip","zip");
@ -71,11 +69,10 @@ public class zipParser extends AbstractParser implements Idiom {
}
public zipParser() {
super();
this.parserName = "Compressed Archive File Parser";
super("Compressed Archive File Parser");
}
public Hashtable<String, String> getSupportedMimeTypes() {
public HashMap<String, String> getSupportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
@ -118,7 +115,7 @@ public class zipParser extends AbstractParser implements Idiom {
final String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
// trying to determine the mimeType per file extension
final String entryMime = Classification.getMimeTypeByFileExt(entryExt);
final String entryMime = Parser.mimeOf(entryExt);
// parsing the content
File subDocTempFile = null;

@ -231,7 +231,7 @@ public final class httpdFileHandler {
}
headers.put(httpHeader.SERVER, "AnomicHTTPD (www.anomic.de)");
headers.put(httpHeader.DATE, DateFormatter.formatRFC1123(new Date()));
if(!(Classification.mediaExtContains(ext))){
if(!(Classification.isMediaExtension(ext))){
headers.put(httpHeader.PRAGMA, "no-cache");
}
return headers;

@ -73,7 +73,7 @@ import java.util.zip.GZIPOutputStream;
import de.anomic.crawler.HTTPLoader;
import de.anomic.data.Blacklist;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.document.parser.html.ContentTransformer;
import de.anomic.document.parser.html.Transformer;
import de.anomic.kelondro.util.DateFormatter;
@ -522,13 +522,13 @@ public final class httpdProxyHandler {
res.getStatusLine().substring(4), // status text
responseHeader);
if(hasBody(res.getStatusCode())) {
if (hasBody(res.getStatusCode())) {
final OutputStream outStream = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
final String storeError = cacheEntry.shallStoreCacheForProxy();
final boolean storeHTCache = cacheEntry.profile().storeHTCache();
final boolean isSupportedContent = Classification.supportedContent(cacheEntry.url(), cacheEntry.getMimeType());
final boolean isSupportedContent = Parser.supportsExtension(cacheEntry.url()) && Parser.supportsMime(cacheEntry.getMimeType());
if (
/*
* Now we store the response into the htcache directory if

@ -42,6 +42,7 @@ import java.util.HashMap;
import java.util.Map;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.http.httpResponseHeader;
import de.anomic.http.httpDocument;
import de.anomic.kelondro.blob.ArrayStack;
@ -181,7 +182,7 @@ public final class plasmaHTCache {
}
public static boolean isText(final String mimeType) {
return Classification.supportedMimeTypesContains(mimeType);
return Parser.supportsMime(mimeType);
}
public static boolean noIndexingURL(final yacyURL url) {
@ -200,7 +201,7 @@ public final class plasmaHTCache {
//php
return Classification.mediaExtContains(urlString);
return Classification.isMediaExtension(urlString);
}

@ -144,7 +144,6 @@ import de.anomic.data.wiki.wikiBoard;
import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
import de.anomic.document.Condenser;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Word;
@ -513,18 +512,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
//Init bookmarks DB
initBookmarks();
// make parser
log.logConfig("Starting Parser");
// define an extension-blacklist
log.logConfig("Parser: Initializing Extension Mappings for Media/Parser");
Classification.initMediaExt(Classification.extString2extList(getConfig(plasmaSwitchboardConstants.PARSER_MEDIA_EXT,"")));
Classification.initSupportedHTMLFileExt(Classification.extString2extList(getConfig(plasmaSwitchboardConstants.PARSER_MEDIA_EXT_PARSEABLE,"")));
// define a realtime parsable mimetype list
log.logConfig("Parser: Initializing Mime Types");
Classification.initHTMLParsableMimeTypes(getConfig(plasmaSwitchboardConstants.PARSER_MIMETYPES_HTML, "application/xhtml+xml,text/html,text/plain"));
Classification.addParseableMimeTypes(getConfig(plasmaSwitchboardConstants.PARSER_MIMETYPES, null));
log.logConfig("Parser: Initializing Mime Type deny list");
Parser.setDenyMime(getConfig(plasmaSwitchboardConstants.PARSER_MIME_DENY, null));
// start a loader
log.logConfig("Starting Crawl Loader");
@ -1098,7 +1088,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
*
* Testing if the content type is supported by the available parsers
* ========================================================================= */
final boolean isSupportedContent = Classification.supportedContent(entry.url(),entry.getMimeType());
final boolean isSupportedContent = Parser.supportsExtension(entry.url()) && Parser.supportsMime(entry.getMimeType());
if (log.isFinest()) log.logFinest("STORE "+ entry.url() +" content of type "+ entry.getMimeType() +" is supported: "+ isSupportedContent);
/* =========================================================================

@ -244,11 +244,7 @@ public final class plasmaSwitchboardConstants {
public static final String RANKING_DIST_1_METHOD = "CRDist1Method";
public static final String RANKING_DIST_1_PERCENT = "CRDist1Percent";
public static final String RANKING_DIST_1_TARGET = "CRDist1Target";
public static final String PARSER_MIMETYPES = "parseableMimeTypes";
public static final String PARSER_MIMETYPES_HTML = "parseableMimeTypes.HTML";
public static final String PARSER_MIMETYPES_IMAGE = "parseableMimeTypes.IMAGE";
public static final String PARSER_MEDIA_EXT = "mediaExt";
public static final String PARSER_MEDIA_EXT_PARSEABLE = "parseableExt";
public static final String PARSER_MIME_DENY = "parser.mime.deny";
/**
* <p><code>public static final String <strong>PROXY_ONLINE_CAUTION_DELAY</strong> = "onlineCautionDelay"</code></p>
* <p>Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds</p>

@ -39,7 +39,6 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.anomic.document.Condenser;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Word;
@ -867,29 +866,13 @@ public class SnippetCache {
// STEP 3: if the metadata is still null try to guess the mimeType of the resource
if (responseHeader == null) {
final String filename = url.getFileName();
final int p = filename.lastIndexOf('.');
if ( // if no extension is available
(p < 0) ||
// or the extension is supported by one of the parsers
((p >= 0) && (Classification.supportedFileExtContains(filename.substring(p + 1))))
) {
String supposedMime = "text/html";
// if the mimeType Parser is installed we can set the mimeType to null to force
// a mimetype detection
if (Classification.supportedMimeTypesContains("application/octet-stream")) {
supposedMime = null;
} else if (p != -1){
// otherwise we try to determine the mimeType per file Extension
supposedMime = Classification.getMimeTypeByFileExt(filename.substring(p + 1));
}
if (Parser.supportsExtension(url)) {
String supposedMime = Parser.mimeOf(url);
return Parser.parseSource(url, supposedMime, null, contentLength, resourceStream);
}
return null;
}
if (Classification.supportedMimeTypesContains(responseHeader.mime())) {
if (Parser.supportsMime(responseHeader.mime())) {
return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream);
}
return null;

@ -58,7 +58,6 @@ import java.util.concurrent.TimeoutException;
import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
@ -102,9 +101,6 @@ public class mediawikiIndex extends Thread {
this.wparser = new wikiCode(new URL(baseURL).getHost());
this.count = 0;
this.start = 0;
// must be called before usage:
Classification.initHTMLParsableMimeTypes("text/html");
Classification.addParseableMimeTypes("text/html");
}
/**
@ -146,8 +142,6 @@ public class mediawikiIndex extends Thread {
StringBuilder sb = new StringBuilder();
boolean page = false, text = false;
String title = null;
Classification.initHTMLParsableMimeTypes("text/html");
Classification.addParseableMimeTypes("text/html");
wikiparserrecord poison = newRecord();
int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1);
BlockingQueue<wikiparserrecord> in = new ArrayBlockingQueue<wikiparserrecord>(threads * 10);

@ -528,6 +528,13 @@ public class yacyURL implements Serializable {
return path.substring(p + 1); // the 'real' file name
}
/**
 * Returns the extension of this URL's file name.
 * <p>
 * The extension is everything that follows the last {@code '.'} in the
 * value returned by {@link #getFileName()}. The dot itself is not part of
 * the result, and the case of the characters is left unchanged.
 *
 * @return the text after the last dot of the file name, or an empty string
 *         when the file name contains no dot
 */
public String getFileExtension() {
    final String fileName = getFileName();
    final int lastDot = fileName.lastIndexOf('.');
    // no dot at all means there is no extension to report
    return (lastDot >= 0) ? fileName.substring(lastDot + 1) : "";
}
public String getPath() {
return path;
}

Loading…
Cancel
Save