diff --git a/defaults/yacy.init b/defaults/yacy.init index 6624a162e..791381c24 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -247,23 +247,9 @@ releases = DATA/RELEASE minimumLocalDelta = 0 minimumGlobalDelta = 500 -# the following mime-types are the whitelist for indexing -# -# parseableMime: specifies mime-types that can be indexed with any built-in parser -parseableMimeTypes=application/atom+xml,application/bzip2,application/excel,application/gzip,application/java-archive,application/msexcel,application/mspowerpoint,application/msword,application/octet-stream,application/pdf,application/postscript,application/powerpoint,application/rdf+xml,application/rss+xml,application/rtf,application/tar,application/vcard,application/visio,application/visio.drawing,application/vnd.ms-excel,application/vnd.ms-powerpoint,application/vnd.oasis.opendocument.text,application/vnd.visio,application/vsd,application/x-7z-compressed,application/x-bz2,application/x-bzip2,application/x-compress,application/x-compressed,application/x-excel,application/x-gzip,application/x-msexcel,application/x-redhat packet manager,application/x-redhat-package-manager,application/x-rpm,application/x-shockwave-flash,application/x-shockwave-flash2-preview,application/x-tar,application/x-visio,application/x-vnd.oasis.opendocument.text,application/x-vsd,application/x-xml,application/x-zip,application/x-zip-compressed,application/xml,application/zip,image/x-vsd,text/postscript,text/rss,text/rtf,text/x-vcard,text/xml,zz-application/zz-winassoc-vsd - -# parseableMimeTypes.IMAGE: specifies mime-types that refer to image type content -parseableMimeTypes.IMAGE=image/gif,image/jpeg,image/png,image/tiff,image/vnd.wap.wbmp,image/x-icon,image/bmp - -# parseableMimeTypes.HTML: specifies mime-types that can be indexed with built-in html parser -parseableMimeTypes.HTML=application/xhtml+xml,text/html,text/plain,text/sgml - -# media extension string -# a comma-separated list of extensions that denote media 
file formats -# this is important to recognize - tags as not-html reference -# These files will be excluded from indexing _(Please keep extensions in alphabetical order)_ -mediaExt=7z,ace,aif,aiff,arj,asf,asx,avi,bin,bmp,bz2,css,db,dcm,deb,doc,dll,dmg,exe,gif,gz,hqx,ico,img,iso,jar,jpe,jpg,jpeg,lx,lxl,m4v,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,scr,sit,so,swf,sxc,sxd,sxi,sxw,tar,tbz,tgz,torrent,vsd,war,wav,wmv,xcf,xls,zip -parseableExt=html,htm,txt,php,shtml,asp,aspx,jsp +# the following mime-types are a blacklist for indexing: +# parser.mime.deny: specifies mime-types that shall not be indexed +parser.mime.deny= # Promotion Strings # These strings appear in the Web Mask of the YACY search client diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 027f5a644..909ad984f 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -29,15 +29,13 @@ import java.net.InetSocketAddress; import java.net.SocketException; -import java.util.Arrays; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.StringTokenizer; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import de.anomic.document.Classification; +import de.anomic.document.Parser; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpd; @@ -46,6 +44,7 @@ import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Digest; import de.anomic.kelondro.util.DateFormatter; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaSwitchboardConstants; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -458,30 +457,16 @@ public class SettingsAck_p { if (post.containsKey("parserSettings")) { post.remove("parserSettings"); - final HashSet newConfig = new HashSet(); - // loop through all received settings final Iterator keyEnum = post.keySet().iterator(); 
while (keyEnum.hasNext()) { String key = keyEnum.next(); - if (key.startsWith("mimename")) newConfig.add(post.get(key)); + if (key.startsWith("mimename")) Parser.grantMime(key.substring(9), post.get(key).equals("on")); } - int enabledMimesCount = 0; - final StringBuilder currEnabledMimesTxt = new StringBuilder(); - final String[] enabledMimes = Classification.setEnabledParserList(newConfig); - Arrays.sort(enabledMimes); - - currEnabledMimesTxt.setLength(0); - for (int i=0; i < enabledMimes.length; i++) { - currEnabledMimesTxt.append(enabledMimes[i]).append(","); - prop.put("info_parser_" + enabledMimesCount + "_enabledMime", newConfig.toString()); - enabledMimesCount++; - } - if (currEnabledMimesTxt.length() > 0) currEnabledMimesTxt.deleteCharAt(currEnabledMimesTxt.length()-1); - env.setConfig("parseableMimeTypes", currEnabledMimesTxt.toString()); + env.setConfig(plasmaSwitchboardConstants.PARSER_MIME_DENY, Parser.getDenyMime()); - prop.put("info_parser",enabledMimesCount); + prop.put("info_parser", 0); prop.put("info", "18"); return prop; diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java index 40857bf38..ccfb51925 100644 --- a/htroot/Settings_p.java +++ b/htroot/Settings_p.java @@ -24,11 +24,9 @@ // javac -classpath .:../Classes Settings_p.java // if the shell's current path is HTROOT -import java.util.Enumeration; import java.util.HashMap; import java.util.Iterator; -import de.anomic.document.Classification; import de.anomic.document.Idiom; import de.anomic.document.Parser; import de.anomic.http.httpHeader; @@ -219,17 +217,15 @@ public final class Settings_p { */ int parserIdx = 0; - final Iterator availableParserIter = Parser.availableParserList.values().iterator(); + final Iterator availableParserIter = Parser.idioms().iterator(); while (availableParserIter.hasNext()) { final Idiom parserInfo = availableParserIter.next(); prop.put("parser_" + parserIdx + "_name", parserInfo.getName()); int mimeIdx = 0; - final Enumeration mimeTypeIter = 
parserInfo.getSupportedMimeTypes().keys(); - while (mimeTypeIter.hasMoreElements()) { - final String mimeType = mimeTypeIter.nextElement(); + for (String mimeType: parserInfo.getSupportedMimeTypes().keySet()) { prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_mimetype", mimeType); - prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (Classification.supportedMimeTypesContains(mimeType)) ? 1 : 0); + prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (Parser.supportsMime(mimeType)) ? 1 : 0); mimeIdx++; } prop.put("parser_" + parserIdx + "_mime", mimeIdx); diff --git a/source/de/anomic/crawler/FTPLoader.java b/source/de/anomic/crawler/FTPLoader.java index d32d4d663..f501c247b 100644 --- a/source/de/anomic/crawler/FTPLoader.java +++ b/source/de/anomic/crawler/FTPLoader.java @@ -32,7 +32,7 @@ import java.io.IOException; import java.io.PrintStream; import java.util.Date; -import de.anomic.document.Classification; +import de.anomic.document.Parser; import de.anomic.http.httpHeader; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; @@ -218,15 +218,24 @@ public class FTPLoader { private httpDocument getFile(final ftpc ftpClient, final CrawlEntry entry) throws Exception { // determine the mimetype of the resource final yacyURL entryUrl = entry.url(); - final String extension = Classification.getFileExt(entryUrl); - final String mimeType = Classification.getMimeTypeByFileExt(extension); + final String mimeType = Parser.mimeOf(entryUrl); final String path = getPath(entryUrl); // if the mimetype and file extension is supported we start to download // the file httpDocument htCache = null; - if (Classification.supportedContent(entryUrl, mimeType)) { - // aborting download if content is too long + if (!Parser.supportsExtension(entryUrl)) { + // if the response has not the right file type then reject file + log.logInfo("REJECTED WRONG EXTENSION TYPE " + mimeType + " for URL " + entry.url().toString()); + 
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong extension"); + throw new Exception("response has not the right extension type -> rejected"); + } else if (!Parser.supportsMime(mimeType)) { + // if the response has not the right file type then reject file + log.logInfo("REJECTED WRONG MIME TYPE " + mimeType + " for URL " + entry.url().toString()); + sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong mime type"); + throw new Exception("response has not the right mime type -> rejected"); + } else { + // abort the download if content is too long final int size = ftpClient.fileSize(path); if (size <= maxFileSize || maxFileSize == -1) { // timeout for download @@ -246,11 +255,6 @@ public class FTPLoader { sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded"); throw new Exception("file size exceeds limit"); } - } else { - // if the response has not the right file type then reject file - log.logInfo("REJECTED WRONG MIME/EXT TYPE " + mimeType + " for URL " + entry.url().toString()); - sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension"); - throw new Exception("response has not the right file type -> rejected"); } return htCache; } diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index 55dbd60d4..04dbe3431 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -29,7 +29,7 @@ import java.io.IOException; import java.util.Date; import de.anomic.data.Blacklist; -import de.anomic.document.Classification; +import de.anomic.document.Parser; import de.anomic.http.httpClient; import de.anomic.http.httpHeader; import de.anomic.http.httpResponse; @@ -156,8 +156,15 @@ public final class HTTPLoader { // request has been placed and result has been returned. 
work off response //try { - if (Classification.supportedContent(entry.url(), res.getResponseHeader().mime())) { - + if (!Parser.supportsExtension(entry.url())) { + // if the response has not the right file type then reject file + sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong extension"); + throw new IOException("REJECTED WRONG EXTENSION TYPE " + entry.url().getFileExtension()+ " for URL " + entry.url().toString()); + } else if (!Parser.supportsMime(res.getResponseHeader().mime())) { + // if the response has not the right file type then reject file + sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type"); + throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString()); + } else { // get the content length and check if the length is allowed long contentLength = res.getResponseHeader().getContentLength(); if (maxFileSize >= 0 && contentLength > maxFileSize) { @@ -177,10 +184,6 @@ public final class HTTPLoader { } htCache.setCacheArray(responseBody); - } else { - // if the response has not the right file type then reject file - sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type or wrong extension"); - throw new IOException("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString()); } return htCache; /* diff --git a/source/de/anomic/document/AbstractParser.java b/source/de/anomic/document/AbstractParser.java index cf79945e9..4276b6ec0 100644 --- a/source/de/anomic/document/AbstractParser.java +++ b/source/de/anomic/document/AbstractParser.java @@ -54,7 +54,7 @@ public abstract class AbstractParser implements Idiom { /** * Parser name */ - protected String parserName = this.getClass().getName(); + private String parserName; /** * The source file file size in bytes if the source document was passed @@ -65,7 +65,7 @@ public abstract class 
AbstractParser implements Idiom { /** * The Constructor of this class. */ - public AbstractParser() { + public AbstractParser(String name) { super(); } @@ -125,10 +125,7 @@ public abstract class AbstractParser implements Idiom { // XXX: workaround for relative paths within document + file.getPath().substring(file.getPath().indexOf(File.separatorChar) + 1) + "/" + file.getName()); - final Document subdoc = Parser.parseSource( - url, - Classification.getMimeTypeByFileExt(files[i].substring(files[i].indexOf('.') + 1)), - null, file); + final Document subdoc = Parser.parseSource(url, Parser.mimeOf(url), null, file); // TODO: change anchors back to use '#' after archive name doc.addSubDocument(subdoc); subdoc.close(); diff --git a/source/de/anomic/document/Classification.java b/source/de/anomic/document/Classification.java index 59c662490..7ea9aeb51 100644 --- a/source/de/anomic/document/Classification.java +++ b/source/de/anomic/document/Classification.java @@ -2,8 +2,6 @@ // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany // first published 09.07.2009 on http://yacy.net // -// This is a part of YaCy, a peer-to-peer based web search engine -// // $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $ // $LastChangedRevision: 5736 $ // $LastChangedBy: borg-0300 $ @@ -30,38 +28,25 @@ import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.util.Arrays; import java.util.HashSet; -import java.util.Hashtable; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; import java.util.Properties; import java.util.Set; -import de.anomic.yacy.yacyURL; -import de.anomic.yacy.logging.Log; - public class Classification { - public static final HashSet supportedHTMLFileExt = new HashSet(); - public static final HashSet supportedHTMLMimeTypes = new HashSet(); - private static final HashSet mediaExtSet = new HashSet(); private static final HashSet imageExtSet = new HashSet(); private static final HashSet audioExtSet = new HashSet(); private static final HashSet videoExtSet = new HashSet(); private static final HashSet appsExtSet = new HashSet(); - private static final Properties mimeTypeLookupByFileExt = new Properties(); - public final static HashSet enabledParserList = new HashSet(); - private final static HashSet supportedFileExt = new HashSet(); + private static final Properties ext2mime = new Properties(); static { // load a list of extensions from file BufferedInputStream bufferedIn = null; try { - mimeTypeLookupByFileExt.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime")))); + ext2mime.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime")))); } catch (final IOException e) { System.err.println("ERROR: httpd.mime not found in settings path"); } finally { @@ -70,219 +55,46 @@ public class Classification { } catch (final Exception e) {} } - final String apps = "sit,hqx,img,dmg,exe,com,bat,sh,vbs,zip,jar"; - final 
String audio = "mp2,mp3,ogg,aac,aif,aiff,wav"; - final String video = "swf,avi,wmv,rm,mov,mpg,mpeg,ram,m4v"; - final String image = "jpg,jpeg,jpe,gif,png,ico,bmp"; + final String apps = "7z,ace,arc,arj,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip"; + final String audio = "aac,aif,aiff,flac,m4a,m4p,mid,mp2,mp3,oga,ogg,ram,wav,wma"; + final String video = "3g2,3gp,3gp2,3gpp,3gpp2,3ivx,asf,asx,avi,div,divx,dv,dvx,env,f4v,flv,hdmov,m1v,m4v,m-jpeg,moov,mov,movie,mp2v,mp4,mpe,mpeg,mpg,mpg4,mv4,ogm,ogv,qt,rm,rv,vid,swf,wmv"; + final String image = "ai,bmp,cdr,cmx,emf,eps,gif,img,jpeg,jpg,mng,pct,pdd,pdn,pict,png,psb,psd,psp,tif,tiff,wmf"; - imageExtSet.addAll(extString2extList(image)); // image formats - audioExtSet.addAll(extString2extList(audio)); // audio formats - videoExtSet.addAll(extString2extList(video)); // video formats - appsExtSet.addAll(extString2extList(apps)); // application formats - - initMediaExt(extString2extList(apps + "," + // application container - "tar,gz,bz2,arj,zip,rar," + // archive formats - "ps,xls,ppt,asf," + // text formats without support - audio + "," + // audio formats - video + "," + // video formats - image // image formats - )); + addSet(imageExtSet, image); // image formats + addSet(audioExtSet, audio); // audio formats + addSet(videoExtSet, video); // video formats + addSet(appsExtSet, apps); // application formats + addSet(mediaExtSet, apps + "," + audio + "," + video + "," + image); // all media formats } - public static List extString2extList(final String extString) { - final LinkedList extensions = new LinkedList(); - if ((extString == null) || (extString.length() == 0)) { - return extensions; - } - final String[] xs = extString.split(","); - for (int i = 0; i < xs.length; i++) - extensions.add(xs[i].toLowerCase().trim()); - return extensions; + private static void addSet(Set set, final String extString) { + if ((extString == null) || 
(extString.length() == 0)) return; + for (String s: extString.split(",")) set.add(s.toLowerCase().trim()); } - public static void initMediaExt(final List mediaExtList) { - mediaExtSet.addAll(mediaExtList); - } - - public static boolean mediaExtContains(String mediaExt) { + public static boolean isMediaExtension(String mediaExt) { if (mediaExt == null) return false; - mediaExt = mediaExt.trim().toLowerCase(); - - if (supportedHTMLFileExt.contains(mediaExt)) return false; - - if (supportedFileExtContains(mediaExt)) return false; - - return mediaExtSet.contains(mediaExt); + return mediaExtSet.contains(mediaExt.trim().toLowerCase()); } - public static boolean imageExtContains(final String imageExt) { + public static boolean isImageExtension(final String imageExt) { if (imageExt == null) return false; return imageExtSet.contains(imageExt.trim().toLowerCase()); } - public static boolean audioExtContains(final String audioExt) { + public static boolean isAudioExtension(final String audioExt) { if (audioExt == null) return false; return audioExtSet.contains(audioExt.trim().toLowerCase()); } - public static boolean videoExtContains(final String videoExt) { + public static boolean isVideoExtension(final String videoExt) { if (videoExt == null) return false; return videoExtSet.contains(videoExt.trim().toLowerCase()); } - public static boolean appsExtContains(final String appsExt) { + public static boolean isApplicationExtension(final String appsExt) { if (appsExt == null) return false; return appsExtSet.contains(appsExt.trim().toLowerCase()); } - public static void initHTMLParsableMimeTypes( - final String htmlParsableMimeTypes) { - final LinkedList mimeTypes = new LinkedList(); - if ((htmlParsableMimeTypes == null) || (htmlParsableMimeTypes.length() == 0)) { - return; - } - final String[] realtimeParsableMimeTypeList = htmlParsableMimeTypes - .split(","); - for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) { - 
mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase().trim()); - } - supportedHTMLMimeTypes.addAll(mimeTypes); - } - - public static String normalizeMimeType(String mimeType) { - // if (mimeType == null) doMimeTypeAnalysis - if (mimeType == null) mimeType = "application/octet-stream"; - mimeType = mimeType.trim().toLowerCase(); - - final int pos = mimeType.indexOf(';'); - return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); - } - - public static String getMimeTypeByFileExt(final String fileExt) { - return mimeTypeLookupByFileExt.getProperty(fileExt, "application/octet-stream"); - } - - public static void initSupportedHTMLFileExt(final List supportedRealtimeFileExtList) { - supportedHTMLFileExt.addAll(supportedRealtimeFileExtList); - } - - static boolean HTMLParsableMimeTypesContains(String mimeType) { - mimeType = normalizeMimeType(mimeType); - return supportedHTMLMimeTypes.contains(mimeType); - } - - public static boolean supportedContent(final yacyURL url, String mimeType) { - mimeType = Classification.normalizeMimeType(mimeType); - if ( - mimeType.equals("text/html") || - mimeType.equals("application/xhtml+xml") || - mimeType.equals("text/plain") - ) { - return supportedMimeTypesContains(mimeType); - } - return supportedMimeTypesContains(mimeType) && supportedFileExt(url); - } - - public static boolean supportedMimeTypesContains(String mimeType) { - mimeType = Classification.normalizeMimeType(mimeType); - - if (Classification.supportedHTMLMimeTypes.contains(mimeType)) return true; - return enabledParserList.contains(mimeType); - } - - private static boolean supportedFileExt(final yacyURL url) { - if (url == null) throw new NullPointerException(); - - // getting the file path - final String name = getFileExt(url); - return supportedFileExtContains(name); - } - - public static boolean supportedFileExtContains(String fileExt) { - if (fileExt == null) return false; - fileExt = fileExt.trim().toLowerCase(); - if 
(Classification.supportedHTMLFileExt.contains(fileExt)) return true; - - return supportedFileExt.contains(fileExt); - } - - public static void addParseableMimeTypes(final String enabledMimeTypes) { - HashSet mimeTypes = null; - if ((enabledMimeTypes == null) || (enabledMimeTypes.length() == 0)) { - mimeTypes = new HashSet(); - } else { - final String[] enabledMimeTypeList = enabledMimeTypes.split(","); - mimeTypes = new HashSet(enabledMimeTypeList.length); - for (int i = 0; i < enabledMimeTypeList.length; i++) mimeTypes.add(enabledMimeTypeList[i].toLowerCase().trim()); - } - setEnabledParserList(mimeTypes); - } - - public static void enableAllParsers() { - final Set availableMimeTypes = Parser.availableParserList.keySet(); - setEnabledParserList(availableMimeTypes); - } - - public static String[] setEnabledParserList(final Set mimeTypeSet) { - - final HashSet newEnabledParsers = new HashSet(); - final HashSet newSupportedFileExt = new HashSet(); - - if (mimeTypeSet != null) { - final Iterator mimeTypes = mimeTypeSet.iterator(); - while (mimeTypes.hasNext()) { - final String mimeType = mimeTypes.next(); - Idiom theParser = Parser.availableParserList.get(mimeType); - if (theParser != null) { - try { - // getting a list of mimeTypes that the parser supports - final Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes(); - if (parserSupportsMimeTypes != null) { - final Object supportedExtensions = parserSupportsMimeTypes.get(mimeType); - if ((supportedExtensions != null) && - (supportedExtensions instanceof String) && - (((String)supportedExtensions).length() > 0)) { - final String[] extArray = ((String)supportedExtensions).split(","); - newSupportedFileExt.addAll(Arrays.asList(extArray)); - } - } - newEnabledParsers.add(mimeType); - - } catch (final Exception e) { - Log.logSevere("PARSER", "error in setEnabledParserList", e); - } finally { - if (theParser != null) - theParser = null; // destroy object - } - } - } - } - - 
enabledParserList.addAll(newEnabledParsers); - supportedFileExt.addAll(newSupportedFileExt); - - return newEnabledParsers.toArray(new String[newEnabledParsers.size()]); - } - - @SuppressWarnings("unchecked") - public static HashSet getEnabledParserList() { - return (HashSet) enabledParserList.clone(); - } - - public static String getFileExt(final yacyURL url) { - // getting the file path - String name = url.getPath(); - - // tetermining last position of / in the file path - int p = name.lastIndexOf('/'); - if (p != -1) { - name = name.substring(p); - } - - // termining last position of . in file path - p = name.lastIndexOf('.'); - if (p < 0) - return ""; - return name.substring(p + 1); - } } diff --git a/source/de/anomic/document/Document.java b/source/de/anomic/document/Document.java index f991befaf..d6a3d144c 100644 --- a/source/de/anomic/document/Document.java +++ b/source/de/anomic/document/Document.java @@ -374,14 +374,14 @@ dc_rights } else { ext = u.substring(extpos + 1).toLowerCase(); } - if (Classification.mediaExtContains(ext)) { + if (Classification.isMediaExtension(ext)) { // this is not a normal anchor, its a media link - if (Classification.imageExtContains(ext)) { + if (Classification.isImageExtension(ext)) { ContentScraper.addImage(collectedImages, new ImageEntry(url, entry.getValue(), -1, -1)); } - else if (Classification.audioExtContains(ext)) audiolinks.put(url, entry.getValue()); - else if (Classification.videoExtContains(ext)) videolinks.put(url, entry.getValue()); - else if (Classification.appsExtContains(ext)) applinks.put(url, entry.getValue()); + else if (Classification.isAudioExtension(ext)) audiolinks.put(url, entry.getValue()); + else if (Classification.isVideoExtension(ext)) videolinks.put(url, entry.getValue()); + else if (Classification.isApplicationExtension(ext)) applinks.put(url, entry.getValue()); } else { hyperlinks.put(url, entry.getValue()); } diff --git a/source/de/anomic/document/Idiom.java 
b/source/de/anomic/document/Idiom.java index 64cd92617..5ab8405ee 100644 --- a/source/de/anomic/document/Idiom.java +++ b/source/de/anomic/document/Idiom.java @@ -27,6 +27,7 @@ package de.anomic.document; import java.io.File; import java.io.InputStream; +import java.util.HashMap; import java.util.Hashtable; import de.anomic.yacy.yacyURL; @@ -85,11 +86,12 @@ public interface Idiom { throws ParserException, InterruptedException; /** - * Can be used to determine the MimeType(s) that are supported by the parser - * @return a {@link Hashtable} containing a list of MimeTypes that are supported by - * the parser + * Get the MimeType(s) that are supported by the parser + * @return a {@link Hashtable} containing a mapping from a mime type string + * to a comma-separated String of file extensions + * that are supported by the idiom parser */ - public Hashtable getSupportedMimeTypes(); + public HashMap getSupportedMimeTypes(); /** * This function should be called before reusing the parser object. diff --git a/source/de/anomic/document/Parser.java b/source/de/anomic/document/Parser.java index 579a05a9e..5baf9eecf 100644 --- a/source/de/anomic/document/Parser.java +++ b/source/de/anomic/document/Parser.java @@ -31,9 +31,13 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; -import java.util.HashMap; -import java.util.Hashtable; -import java.util.Iterator; +import java.text.Collator; +import java.util.HashSet; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; import de.anomic.document.parser.bzipParser; import de.anomic.document.parser.docParser; @@ -59,13 +63,24 @@ import de.anomic.yacy.logging.Log; public final class Parser { - private static final Log theLogger = new Log("PARSER"); - public static final HashMap availableParserList = new HashMap(); + private static final Log log = new Log("PARSER"); + + // use a collator to relax 
when distinguishing between lowercase und uppercase letters + private static final Collator insensitiveCollator = Collator.getInstance(Locale.US); + static { + insensitiveCollator.setStrength(Collator.SECONDARY); + insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION); + } + + private static final Map mime2parser = new TreeMap(insensitiveCollator); + private static final Map> ext2mime = new TreeMap>(insensitiveCollator); + private static final Set denyMime = new TreeSet(insensitiveCollator); static { initParser(new bzipParser()); initParser(new docParser()); initParser(new gzipParser()); + initParser(new htmlParser()); initParser(new mimeTypeParser()); initParser(new odtParser()); initParser(new pdfParser()); @@ -82,14 +97,30 @@ public final class Parser { initParser(new xlsParser()); initParser(new zipParser()); } + + public static Set idioms() { + Set c = new HashSet(); + c.addAll(mime2parser.values()); + return c; + } + + private static void initParser(Idiom parser) { + for (Map.Entry e: parser.getSupportedMimeTypes().entrySet()) { + // process the mime types + final String mimeType = e.getKey(); + Idiom p0 = mime2parser.get(mimeType); + if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser."); + mime2parser.put(mimeType, parser); + Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName()); - private static void initParser(Idiom theParser) { - final Hashtable supportedMimeTypes = theParser.getSupportedMimeTypes(); - final Iterator mimeTypeIterator = supportedMimeTypes.keySet().iterator(); - while (mimeTypeIterator.hasNext()) { - final String mimeType = mimeTypeIterator.next(); - availableParserList.put(mimeType, theParser); - Log.logInfo("PARSER", "Found parser for mimeType '" + mimeType + "': " + theParser.getName()); + // process the extensions + String[] exts = e.getValue().split(","); + for (String ext: exts) { + Set s = ext2mime.get(ext); + if (s 
== null) s = new HashSet(); + s.add(mimeType); + ext2mime.put(ext, s); + } } } @@ -99,10 +130,10 @@ public final class Parser { ParserException { ByteArrayInputStream byteIn = null; try { - if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from byte-array"); + if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array"); if (sourceArray == null || sourceArray.length == 0) { final String errorMsg = "No resource content available (1) " + (((sourceArray == null) ? "source == null" : "source.length() == 0") + ", url = " + location.toNormalform(true, false)); - theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + log.logInfo("Unable to parse '" + location + "'. " + errorMsg); throw new ParserException(errorMsg, location, errorMsg); } byteIn = new ByteArrayInputStream(sourceArray); @@ -110,7 +141,7 @@ public final class Parser { } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; - theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e); + log.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e); throw new ParserException("Unexpected exception while parsing " + location, location, e); } finally { if (byteIn != null) try { @@ -125,10 +156,10 @@ public final class Parser { BufferedInputStream sourceStream = null; try { - if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from file"); + if (log.isFine()) log.logFine("Parsing '" + location + "' from file"); if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2)."; - theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + log.logInfo("Unable to parse '" + location + "'. 
" + errorMsg); throw new ParserException(errorMsg, location, "document has no content"); } sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); @@ -136,7 +167,7 @@ public final class Parser { } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; - theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e); + log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e); throw new ParserException("Unexpected exception while parsing " + location, location, e); } finally { if (sourceStream != null)try { @@ -150,31 +181,34 @@ public final class Parser { final long contentLength, final InputStream sourceStream) throws InterruptedException, ParserException { try { - if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from stream"); - mimeType = Classification.normalizeMimeType(mimeType); - final String fileExt = Classification.getFileExt(location); + if (log.isFine()) log.logFine("Parsing '" + location + "' from stream"); + mimeType = normalizeMimeType(mimeType); + final String fileExt = location.getFileExtension(); final String documentCharset = htmlParser.patchCharsetEncoding(charset); - if (!Classification.supportedContent(location, mimeType)) { - final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (1)"; - theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location, "wrong mime type or wrong extension"); + if (!supportsMime(mimeType)) { + final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'"; + log.logInfo("Unable to parse '" + location + "'. 
" + errorMsg); + throw new ParserException(errorMsg, location, "wrong mime type"); + } + if (!supportsExtension(location)) { + final String errorMsg = "No parser available to parse extension of url path"; + log.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg, location, "wrong extension"); } - if (theLogger.isFine()) theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); - Idiom parser = availableParserList.get(Classification.normalizeMimeType(mimeType)); + if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); + Idiom parser = mime2parser.get(normalizeMimeType(mimeType)); Document doc = null; if (parser != null) { parser.setContentLength(contentLength); doc = parser.parse(location, mimeType, documentCharset, sourceStream); - } else if (Classification.HTMLParsableMimeTypesContains(mimeType)) { - doc = new htmlParser().parse(location, mimeType, documentCharset, sourceStream); } else { final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)"; - theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + log.logInfo("Unable to parse '" + location + "'. " + errorMsg); throw new ParserException(errorMsg, location, "wrong mime type or wrong extension"); } if (doc == null) { final String errorMsg = "Unexpected error. Parser returned null."; - theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + log.logInfo("Unable to parse '" + location + "'. " + errorMsg); throw new ParserException(errorMsg, location); } return doc; @@ -182,9 +216,50 @@ public final class Parser { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; final String errorMsg = "Unexpected exception. " + e.getMessage(); - theLogger.logSevere("Unable to parse '" + location + "'. 
" + errorMsg, e); + log.logSevere("Unable to parse '" + location + "'. " + errorMsg, e); throw new ParserException(errorMsg, location, e); } } + public static boolean supportsMime(String mimeType) { + return !denyMime.contains(mimeType) && mime2parser.containsKey(normalizeMimeType(mimeType)); + } + + public static boolean supportsExtension(final yacyURL url) { + String ext = url.getFileExtension(); + if (ext.length() == 0) return true; // may be anything; thats ok if the mime type is ok + return ext2mime.containsKey(ext); + } + + public static String mimeOf(yacyURL url) { + return mimeOf(url.getFileExtension()); + } + + public static String mimeOf(String ext) { + Set mimes = ext2mime.get(ext); + if (mimes == null) return null; + return mimes.iterator().next(); + } + + private static String normalizeMimeType(String mimeType) { + if (mimeType == null) return "application/octet-stream"; + final int pos = mimeType.indexOf(';'); + return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim()); + } + + public static void setDenyMime(String denyList) { + denyMime.clear(); + for (String s: denyList.split(",")) denyMime.add(s); + } + + public static String getDenyMime() { + String s = ""; + for (String d: denyMime) s += d + ","; + s = s.substring(0, s.length() - 1); + return s; + } + + public static void grantMime(String mime, boolean grant) { + if (grant) denyMime.remove(mime); else denyMime.add(mime); + } } diff --git a/source/de/anomic/document/parser/bzipParser.java b/source/de/anomic/document/parser/bzipParser.java index 8173e80be..2b5321cc0 100644 --- a/source/de/anomic/document/parser/bzipParser.java +++ b/source/de/anomic/document/parser/bzipParser.java @@ -30,8 +30,7 @@ package de.anomic.document.parser; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; -import java.util.Hashtable; - +import java.util.HashMap; import org.apache.tools.bzip2.CBZip2InputStream; import de.anomic.document.AbstractParser; @@ -48,7 +47,7 @@ public 
class bzipParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static final String fileExtensions = "bz2,tbz,tbz2"; static { SUPPORTED_MIME_TYPES.put("application/x-bzip2",fileExtensions); @@ -61,11 +60,10 @@ public class bzipParser extends AbstractParser implements Idiom { } public bzipParser() { - super(); - this.parserName = "Bzip 2 UNIX Compressed File Parser"; + super("Bzip 2 UNIX Compressed File Parser"); } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/docParser.java b/source/de/anomic/document/parser/docParser.java index ce4db99af..41b47d136 100644 --- a/source/de/anomic/document/parser/docParser.java +++ b/source/de/anomic/document/parser/docParser.java @@ -28,8 +28,7 @@ package de.anomic.document.parser; import java.io.InputStream; -import java.util.Hashtable; - +import java.util.HashMap; import org.textmining.extraction.TextExtractor; import org.textmining.extraction.word.WordTextExtractorFactory; @@ -45,22 +44,22 @@ public class docParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { - SUPPORTED_MIME_TYPES.put("application/msword","doc"); - SUPPORTED_MIME_TYPES.put("application/doc","doc"); - SUPPORTED_MIME_TYPES.put("appl/text","doc"); - SUPPORTED_MIME_TYPES.put("application/vnd.msword","doc"); - SUPPORTED_MIME_TYPES.put("application/vnd.ms-word","doc"); - SUPPORTED_MIME_TYPES.put("application/winword","doc"); - SUPPORTED_MIME_TYPES.put("application/word","doc"); - 
SUPPORTED_MIME_TYPES.put("application/x-msw6","doc"); - SUPPORTED_MIME_TYPES.put("application/x-msword","doc"); + String ext = "doc,docx"; + SUPPORTED_MIME_TYPES.put("application/msword",ext); + SUPPORTED_MIME_TYPES.put("application/doc",ext); + SUPPORTED_MIME_TYPES.put("appl/text",ext); + SUPPORTED_MIME_TYPES.put("application/vnd.msword",ext); + SUPPORTED_MIME_TYPES.put("application/vnd.ms-word",ext); + SUPPORTED_MIME_TYPES.put("application/winword",ext); + SUPPORTED_MIME_TYPES.put("application/word",ext); + SUPPORTED_MIME_TYPES.put("application/x-msw6",ext); + SUPPORTED_MIME_TYPES.put("application/x-msword",ext); } public docParser() { - super(); - this.parserName = "Word Document Parser"; + super("Word Document Parser"); } public Document parse(final yacyURL location, final String mimeType, final String charset, @@ -103,7 +102,7 @@ public class docParser extends AbstractParser implements Idiom { } } - public java.util.Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return docParser.SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/gzipParser.java b/source/de/anomic/document/parser/gzipParser.java index 408bbff98..78aa2f491 100644 --- a/source/de/anomic/document/parser/gzipParser.java +++ b/source/de/anomic/document/parser/gzipParser.java @@ -30,7 +30,7 @@ package de.anomic.document.parser; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; -import java.util.Hashtable; +import java.util.HashMap; import java.util.zip.GZIPInputStream; import de.anomic.document.AbstractParser; @@ -47,27 +47,26 @@ public class gzipParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); - static final String fileExtensions = "gz,tgz"; + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); + static final String ext = "gz,tgz"; 
static { - SUPPORTED_MIME_TYPES.put("application/x-gzip",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/gzip",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-gunzip",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/gzipped",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/gzip-compressed",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-compressed",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-compress",fileExtensions); - SUPPORTED_MIME_TYPES.put("gzip/document",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/octet-stream",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-tar",fileExtensions); + SUPPORTED_MIME_TYPES.put("application/x-gzip",ext); + SUPPORTED_MIME_TYPES.put("application/gzip",ext); + SUPPORTED_MIME_TYPES.put("application/x-gunzip",ext); + SUPPORTED_MIME_TYPES.put("application/gzipped",ext); + SUPPORTED_MIME_TYPES.put("application/gzip-compressed",ext); + SUPPORTED_MIME_TYPES.put("application/x-compressed",ext); + SUPPORTED_MIME_TYPES.put("application/x-compress",ext); + SUPPORTED_MIME_TYPES.put("gzip/document",ext); + SUPPORTED_MIME_TYPES.put("application/octet-stream",ext); + SUPPORTED_MIME_TYPES.put("application/x-tar",ext); } public gzipParser() { - super(); - this.parserName = "GNU Zip Compressed Archive Parser"; + super("GNU Zip Compressed Archive Parser"); } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/htmlParser.java b/source/de/anomic/document/parser/htmlParser.java index 2d83d09b6..743226dda 100644 --- a/source/de/anomic/document/parser/htmlParser.java +++ b/source/de/anomic/document/parser/htmlParser.java @@ -31,8 +31,7 @@ import java.io.InputStream; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; -import java.util.Hashtable; - +import java.util.HashMap; 
import de.anomic.document.AbstractParser; import de.anomic.document.Document; import de.anomic.document.Idiom; @@ -49,17 +48,17 @@ public class htmlParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); - static { - SUPPORTED_MIME_TYPES.put("application/xhtml+xml","htm,html,xhtml,php,asp"); - SUPPORTED_MIME_TYPES.put("text/html","htm,html,xhtml,php,asp"); - SUPPORTED_MIME_TYPES.put("text/plain","htm,html,xhtml,php,asp,txt"); - SUPPORTED_MIME_TYPES.put("text/sgml","htm,html,xhtml,php,asp,xml"); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); + static { + String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp"; + SUPPORTED_MIME_TYPES.put("application/xhtml+xml", ext); + SUPPORTED_MIME_TYPES.put("text/html", ext); + SUPPORTED_MIME_TYPES.put("text/plain", ext); + SUPPORTED_MIME_TYPES.put("text/sgml",ext); } public htmlParser() { - super(); - this.parserName = "streaming html parser"; + super("streaming html parser"); } @Override @@ -215,7 +214,7 @@ public class htmlParser extends AbstractParser implements Idiom { } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/mimeTypeParser.java b/source/de/anomic/document/parser/mimeTypeParser.java index d36d72825..c4c568e17 100644 --- a/source/de/anomic/document/parser/mimeTypeParser.java +++ b/source/de/anomic/document/parser/mimeTypeParser.java @@ -31,6 +31,7 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.Collection; +import java.util.HashMap; import java.util.Hashtable; import net.sf.jmimemagic.Magic; @@ -54,14 +55,14 @@ public class mimeTypeParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() 
*/ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("text/xml","xml"); SUPPORTED_MIME_TYPES.put("application/xml","xml"); - SUPPORTED_MIME_TYPES.put("application/x-xml","xml"); - SUPPORTED_MIME_TYPES.put("application/octet-stream",""); - SUPPORTED_MIME_TYPES.put("application/x-compress",""); - SUPPORTED_MIME_TYPES.put("application/x-compressed",""); + SUPPORTED_MIME_TYPES.put("application/x-xml","xml"); + SUPPORTED_MIME_TYPES.put("application/octet-stream","xml"); + SUPPORTED_MIME_TYPES.put("application/x-compress","xml"); + SUPPORTED_MIME_TYPES.put("application/x-compressed","xml"); } /** @@ -71,8 +72,7 @@ public class mimeTypeParser extends AbstractParser implements Idiom { private static Hashtable threadLoopDetection = new Hashtable(); public mimeTypeParser() { - super(); - this.parserName = "MimeType Parser"; + super("MimeType Parser"); } @SuppressWarnings("unchecked") @@ -174,7 +174,7 @@ public class mimeTypeParser extends AbstractParser implements Idiom { } - public java.util.Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return mimeTypeParser.SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/odtParser.java b/source/de/anomic/document/parser/odtParser.java index 0f43be1a9..c7119289c 100644 --- a/source/de/anomic/document/parser/odtParser.java +++ b/source/de/anomic/document/parser/odtParser.java @@ -35,8 +35,8 @@ import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import java.util.Enumeration; +import java.util.HashMap; import java.util.HashSet; -import java.util.Hashtable; import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; @@ -63,18 +63,17 @@ public class odtParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public 
static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt"); SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt"); } public odtParser() { - super(); - this.parserName = "OASIS OpenDocument V2 Text Document Parser"; + super("OASIS OpenDocument V2 Text Document Parser"); } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/pdfParser.java b/source/de/anomic/document/parser/pdfParser.java index af76bd98a..78a5a589f 100644 --- a/source/de/anomic/document/parser/pdfParser.java +++ b/source/de/anomic/document/parser/pdfParser.java @@ -33,8 +33,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.Writer; -import java.util.Hashtable; - +import java.util.HashMap; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDDocumentInformation; @@ -56,7 +55,7 @@ public class pdfParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("application/pdf","pdf"); SUPPORTED_MIME_TYPES.put("application/x-pdf","pdf"); @@ -67,11 +66,10 @@ public class pdfParser extends AbstractParser implements Idiom { } public pdfParser() { - super(); - this.parserName = "Acrobat Portable Document Parser"; + super("Acrobat Portable Document Parser"); } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/pptParser.java 
b/source/de/anomic/document/parser/pptParser.java index 16676329f..3729182ed 100644 --- a/source/de/anomic/document/parser/pptParser.java +++ b/source/de/anomic/document/parser/pptParser.java @@ -29,8 +29,7 @@ package de.anomic.document.parser; import java.io.BufferedInputStream; import java.io.InputStream; -import java.util.Hashtable; - +import java.util.HashMap; import org.apache.poi.hslf.extractor.PowerPointExtractor; import de.anomic.document.AbstractParser; @@ -45,22 +44,21 @@ public class pptParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); - static final String fileExtensions = "ppt,pps"; + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); + static final String ext = "ppt,pps"; static { - SUPPORTED_MIME_TYPES.put("application/mspowerpoint",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/powerpoint",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/vnd.ms-powerpoint",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/ms-powerpoint",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/mspowerpnt",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/vnd-mspowerpoint",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-powerpoint",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-m",fileExtensions); + SUPPORTED_MIME_TYPES.put("application/mspowerpoint",ext); + SUPPORTED_MIME_TYPES.put("application/powerpoint",ext); + SUPPORTED_MIME_TYPES.put("application/vnd.ms-powerpoint",ext); + SUPPORTED_MIME_TYPES.put("application/ms-powerpoint",ext); + SUPPORTED_MIME_TYPES.put("application/mspowerpnt",ext); + SUPPORTED_MIME_TYPES.put("application/vnd-mspowerpoint",ext); + SUPPORTED_MIME_TYPES.put("application/x-powerpoint",ext); + SUPPORTED_MIME_TYPES.put("application/x-m",ext); } public pptParser(){ - super(); - this.parserName = "Microsoft 
Powerpoint Parser"; + super("Microsoft Powerpoint Parser"); } /* @@ -116,7 +114,7 @@ public class pptParser extends AbstractParser implements Idiom { } } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/psParser.java b/source/de/anomic/document/parser/psParser.java index b7a60a405..be6674ce9 100644 --- a/source/de/anomic/document/parser/psParser.java +++ b/source/de/anomic/document/parser/psParser.java @@ -34,8 +34,7 @@ import java.io.FileReader; import java.io.FileWriter; import java.io.InputStream; import java.io.InputStreamReader; -import java.util.Hashtable; - +import java.util.HashMap; import de.anomic.document.AbstractParser; import de.anomic.document.Idiom; import de.anomic.document.ParserException; @@ -49,7 +48,7 @@ public class psParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("application/ps","ps"); SUPPORTED_MIME_TYPES.put("application/x-postscript","ps"); @@ -62,8 +61,7 @@ public class psParser extends AbstractParser implements Idiom { private static String parserMode = "java"; public psParser() { - super(); - this.parserName = "PostScript Document Parser"; + super("PostScript Document Parser"); if (!modeScanDone) synchronized (modeScan) { if (testForPs2Ascii()) parserMode = "ps2ascii"; else parserMode = "java"; @@ -71,7 +69,7 @@ public class psParser extends AbstractParser implements Idiom { } } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/rpmParser.java b/source/de/anomic/document/parser/rpmParser.java index 79dba7936..452bc1572 100644 --- 
a/source/de/anomic/document/parser/rpmParser.java +++ b/source/de/anomic/document/parser/rpmParser.java @@ -31,8 +31,6 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.InputStream; import java.util.HashMap; -import java.util.Hashtable; - import com.jguild.jrpm.io.RPMFile; import com.jguild.jrpm.io.datatype.DataTypeIf; @@ -57,7 +55,7 @@ public class rpmParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("application/x-rpm","rpm"); SUPPORTED_MIME_TYPES.put("application/x-redhat packet manager","rpm"); @@ -65,11 +63,10 @@ public class rpmParser extends AbstractParser implements Idiom { } public rpmParser() { - super(); - this.parserName = "rpm Parser"; + super("rpm Parser"); } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/rssParser.java b/source/de/anomic/document/parser/rssParser.java index aadf35034..d893b6ca8 100644 --- a/source/de/anomic/document/parser/rssParser.java +++ b/source/de/anomic/document/parser/rssParser.java @@ -33,7 +33,6 @@ import java.io.InputStream; import java.io.Writer; import java.nio.charset.Charset; import java.util.HashMap; -import java.util.Hashtable; import java.util.LinkedList; import java.util.Map; @@ -59,7 +58,7 @@ public class rssParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static final String fileExtensions = "xml,rss,rdf"; static { SUPPORTED_MIME_TYPES.put("text/rss",fileExtensions); @@ 
-69,8 +68,7 @@ public class rssParser extends AbstractParser implements Idiom { } public rssParser() { - super(); - this.parserName = "Rich Site Summary/Atom Feed Parser"; + super("Rich Site Summary/Atom Feed Parser"); } public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { @@ -176,7 +174,7 @@ public class rssParser extends AbstractParser implements Idiom { } } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/rtfParser.java b/source/de/anomic/document/parser/rtfParser.java index 3a48ca4c3..30e70894c 100644 --- a/source/de/anomic/document/parser/rtfParser.java +++ b/source/de/anomic/document/parser/rtfParser.java @@ -28,8 +28,7 @@ package de.anomic.document.parser; import java.io.InputStream; -import java.util.Hashtable; - +import java.util.HashMap; import javax.swing.text.DefaultStyledDocument; import javax.swing.text.rtf.RTFEditorKit; @@ -45,7 +44,7 @@ public class rtfParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("application/rtf","rtf"); SUPPORTED_MIME_TYPES.put("text/rtf","rtf"); @@ -57,8 +56,7 @@ public class rtfParser extends AbstractParser implements Idiom { } public rtfParser() { - super(); - this.parserName = "Rich Text Format Parser"; + super("Rich Text Format Parser"); } public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { @@ -100,7 +98,7 @@ public class rtfParser extends AbstractParser implements Idiom { } } - public Hashtable getSupportedMimeTypes() { + 
public HashMap getSupportedMimeTypes() { return rtfParser.SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/sevenzipParser.java b/source/de/anomic/document/parser/sevenzipParser.java index db2159ece..2d3fa7af4 100644 --- a/source/de/anomic/document/parser/sevenzipParser.java +++ b/source/de/anomic/document/parser/sevenzipParser.java @@ -32,8 +32,7 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.util.Hashtable; - +import java.util.HashMap; import SevenZip.ArchiveExtractCallback; import SevenZip.IInStream; import SevenZip.MyRandomAccessFile; @@ -41,7 +40,6 @@ import SevenZip.Archive.IInArchive; import SevenZip.Archive.SevenZipEntry; import SevenZip.Archive.SevenZip.Handler; import de.anomic.document.AbstractParser; -import de.anomic.document.Classification; import de.anomic.document.Idiom; import de.anomic.document.Parser; import de.anomic.document.ParserException; @@ -57,14 +55,13 @@ public class sevenzipParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z"); } public sevenzipParser() { - super(); - super.parserName = "7zip Archive Parser"; + super("7zip Archive Parser"); } public Document parse(final yacyURL location, final String mimeType, final String charset, @@ -127,7 +124,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { } } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } @@ -190,7 +187,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { // workaround for relative links in file, normally '#' shall be used behind the location, see // below for reversion 
of the effects final yacyURL url = yacyURL.newURL(doc.dc_source(), this.prefix + "/" + super.filePath); - final String mime = Classification.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); + final String mime = Parser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); if (this.cfos.isFallback()) { theDoc = Parser.parseSource(url, mime, null, this.cfos.getContentFile()); } else { diff --git a/source/de/anomic/document/parser/swfParser.java b/source/de/anomic/document/parser/swfParser.java index d80bbdd0a..35caacab7 100644 --- a/source/de/anomic/document/parser/swfParser.java +++ b/source/de/anomic/document/parser/swfParser.java @@ -29,8 +29,6 @@ package de.anomic.document.parser; import java.io.InputStream; import java.util.HashMap; -import java.util.Hashtable; - import pt.tumba.parser.swf.SWF2HTML; import de.anomic.document.AbstractParser; import de.anomic.document.Idiom; @@ -44,7 +42,7 @@ public class swfParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash","swf"); SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash2-preview","swf"); @@ -53,14 +51,13 @@ public class swfParser extends AbstractParser implements Idiom { } public swfParser() { - super(); - this.parserName = "Adobe Flash Parser"; + super("Adobe Flash Parser"); } /** * returns a hashtable containing the mimetypes that are supported by this class */ - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/tarParser.java b/source/de/anomic/document/parser/tarParser.java index 739d9e662..64113b8b7 100644 --- 
a/source/de/anomic/document/parser/tarParser.java +++ b/source/de/anomic/document/parser/tarParser.java @@ -34,7 +34,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; import java.util.HashMap; -import java.util.Hashtable; import java.util.LinkedList; import java.util.Map; import java.util.zip.GZIPInputStream; @@ -43,7 +42,6 @@ import com.ice.tar.TarEntry; import com.ice.tar.TarInputStream; import de.anomic.document.AbstractParser; -import de.anomic.document.Classification; import de.anomic.document.Idiom; import de.anomic.document.Parser; import de.anomic.document.ParserException; @@ -60,7 +58,7 @@ public class tarParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("application/x-tar","tar"); SUPPORTED_MIME_TYPES.put("application/tar","tar"); @@ -71,11 +69,10 @@ public class tarParser extends AbstractParser implements Idiom { } public tarParser() { - super(); - this.parserName = "Tape Archive File Parser"; + super("Tape Archive File Parser"); } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } @@ -97,7 +94,7 @@ public class tarParser extends AbstractParser implements Idiom { * If the mimeType was not reported correcly by the webserve we * have to decompress it first */ - final String ext = Classification.getFileExt(location).toLowerCase(); + final String ext = location.getFileExtension().toLowerCase(); if (ext.equals("gz") || ext.equals("tgz")) { source = new GZIPInputStream(source); } @@ -130,7 +127,7 @@ public class tarParser extends AbstractParser implements Idiom { final String entryExt = (idx > -1) ? 
entryName.substring(idx+1) : ""; // trying to determine the mimeType per file extension - final String entryMime = Classification.getMimeTypeByFileExt(entryExt); + final String entryMime = Parser.mimeOf(entryExt); // getting the entry content File subDocTempFile = null; diff --git a/source/de/anomic/document/parser/vcfParser.java b/source/de/anomic/document/parser/vcfParser.java index b172ad523..f2ad16267 100644 --- a/source/de/anomic/document/parser/vcfParser.java +++ b/source/de/anomic/document/parser/vcfParser.java @@ -33,7 +33,6 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.util.HashMap; -import java.util.Hashtable; import java.util.Iterator; import java.util.LinkedList; @@ -61,7 +60,7 @@ public class vcfParser extends AbstractParser implements Idiom { * * TODO: support of x-mozilla-cpt and x-mozilla-html tags */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("text/x-vcard","vcf"); SUPPORTED_MIME_TYPES.put("application/vcard","vcf"); @@ -73,11 +72,10 @@ public class vcfParser extends AbstractParser implements Idiom { } public vcfParser() { - super(); - this.parserName = "vCard Parser"; + super("vCard Parser"); } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/vsdParser.java b/source/de/anomic/document/parser/vsdParser.java index a3eb94fd1..0bea160cc 100644 --- a/source/de/anomic/document/parser/vsdParser.java +++ b/source/de/anomic/document/parser/vsdParser.java @@ -28,8 +28,7 @@ package de.anomic.document.parser; import java.io.InputStream; -import java.util.Hashtable; - +import java.util.HashMap; import de.anomic.document.AbstractParser; import de.anomic.document.Idiom; import de.anomic.document.ParserException; @@ -44,7 +43,7 @@ public class 
vsdParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("application/visio","vsd"); SUPPORTED_MIME_TYPES.put("application/x-visio","vsd"); @@ -57,14 +56,13 @@ public class vsdParser extends AbstractParser implements Idiom { } public vsdParser() { - super(); - this.parserName = "Microsoft Visio Parser"; + super("Microsoft Visio Parser"); } /** * returns a hashtable containing the mimetypes that are supported by this class */ - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/xlsParser.java b/source/de/anomic/document/parser/xlsParser.java index 97634b0f0..0330677e8 100644 --- a/source/de/anomic/document/parser/xlsParser.java +++ b/source/de/anomic/document/parser/xlsParser.java @@ -28,8 +28,7 @@ package de.anomic.document.parser; import java.io.InputStream; -import java.util.Hashtable; - +import java.util.HashMap; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFRequest; @@ -57,21 +56,21 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); - static { - SUPPORTED_MIME_TYPES.put("application/msexcel","xls"); - SUPPORTED_MIME_TYPES.put("application/excel","xls"); - SUPPORTED_MIME_TYPES.put("application/vnd.ms-excel","xls"); - SUPPORTED_MIME_TYPES.put("application/x-excel","xls"); - SUPPORTED_MIME_TYPES.put("application/x-msexcel","xls"); - 
SUPPORTED_MIME_TYPES.put("application/x-ms-excel","xls"); - SUPPORTED_MIME_TYPES.put("application/x-dos_ms_excel","xls"); - SUPPORTED_MIME_TYPES.put("application/xls","xls"); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); + static { + String ext = "xls,xlsx"; + SUPPORTED_MIME_TYPES.put("application/msexcel",ext); + SUPPORTED_MIME_TYPES.put("application/excel",ext); + SUPPORTED_MIME_TYPES.put("application/vnd.ms-excel",ext); + SUPPORTED_MIME_TYPES.put("application/x-excel",ext); + SUPPORTED_MIME_TYPES.put("application/x-msexcel",ext); + SUPPORTED_MIME_TYPES.put("application/x-ms-excel",ext); + SUPPORTED_MIME_TYPES.put("application/x-dos_ms_excel",ext); + SUPPORTED_MIME_TYPES.put("application/xls",ext); } public xlsParser(){ - super(); - this.parserName = "Microsoft Excel Parser"; + super("Microsoft Excel Parser"); } /* @@ -135,7 +134,7 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener { } } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } diff --git a/source/de/anomic/document/parser/zipParser.java b/source/de/anomic/document/parser/zipParser.java index 7b08f58ec..29a2ac431 100644 --- a/source/de/anomic/document/parser/zipParser.java +++ b/source/de/anomic/document/parser/zipParser.java @@ -34,14 +34,12 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; import java.util.HashMap; -import java.util.Hashtable; import java.util.LinkedList; import java.util.Map; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import de.anomic.document.AbstractParser; -import de.anomic.document.Classification; import de.anomic.document.Idiom; import de.anomic.document.Parser; import de.anomic.document.ParserException; @@ -58,7 +56,7 @@ public class zipParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final 
Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { SUPPORTED_MIME_TYPES.put("application/zip","zip"); SUPPORTED_MIME_TYPES.put("application/x-zip","zip"); @@ -71,11 +69,10 @@ public class zipParser extends AbstractParser implements Idiom { } public zipParser() { - super(); - this.parserName = "Compressed Archive File Parser"; + super("Compressed Archive File Parser"); } - public Hashtable getSupportedMimeTypes() { + public HashMap getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } @@ -118,7 +115,7 @@ public class zipParser extends AbstractParser implements Idiom { final String entryExt = (idx > -1) ? entryName.substring(idx+1) : ""; // trying to determine the mimeType per file extension - final String entryMime = Classification.getMimeTypeByFileExt(entryExt); + final String entryMime = Parser.mimeOf(entryExt); // parsing the content File subDocTempFile = null; diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java index ad5df2dc4..a48a8007a 100644 --- a/source/de/anomic/http/httpdFileHandler.java +++ b/source/de/anomic/http/httpdFileHandler.java @@ -231,7 +231,7 @@ public final class httpdFileHandler { } headers.put(httpHeader.SERVER, "AnomicHTTPD (www.anomic.de)"); headers.put(httpHeader.DATE, DateFormatter.formatRFC1123(new Date())); - if(!(Classification.mediaExtContains(ext))){ + if(!(Classification.isMediaExtension(ext))){ headers.put(httpHeader.PRAGMA, "no-cache"); } return headers; diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index cb5e791c2..f365ae71b 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -73,7 +73,7 @@ import java.util.zip.GZIPOutputStream; import de.anomic.crawler.HTTPLoader; import de.anomic.data.Blacklist; -import de.anomic.document.Classification; +import de.anomic.document.Parser; import 
de.anomic.document.parser.html.ContentTransformer; import de.anomic.document.parser.html.Transformer; import de.anomic.kelondro.util.DateFormatter; @@ -522,13 +522,13 @@ public final class httpdProxyHandler { res.getStatusLine().substring(4), // status text responseHeader); - if(hasBody(res.getStatusCode())) { + if (hasBody(res.getStatusCode())) { final OutputStream outStream = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond); final String storeError = cacheEntry.shallStoreCacheForProxy(); final boolean storeHTCache = cacheEntry.profile().storeHTCache(); - final boolean isSupportedContent = Classification.supportedContent(cacheEntry.url(), cacheEntry.getMimeType()); + final boolean isSupportedContent = Parser.supportsExtension(cacheEntry.url()) && Parser.supportsMime(cacheEntry.getMimeType()); if ( /* * Now we store the response into the htcache directory if diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 2dc04f298..66c8bfd6f 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -42,6 +42,7 @@ import java.util.HashMap; import java.util.Map; import de.anomic.document.Classification; +import de.anomic.document.Parser; import de.anomic.http.httpResponseHeader; import de.anomic.http.httpDocument; import de.anomic.kelondro.blob.ArrayStack; @@ -181,7 +182,7 @@ public final class plasmaHTCache { } public static boolean isText(final String mimeType) { - return Classification.supportedMimeTypesContains(mimeType); + return Parser.supportsMime(mimeType); } public static boolean noIndexingURL(final yacyURL url) { @@ -200,7 +201,7 @@ public final class plasmaHTCache { //php - return Classification.mediaExtContains(urlString); + return Classification.isMediaExtension(urlString); } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 5473fbb74..2613f4c31 100644 --- 
a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -144,7 +144,6 @@ import de.anomic.data.wiki.wikiBoard; import de.anomic.data.wiki.wikiCode; import de.anomic.data.wiki.wikiParser; import de.anomic.document.Condenser; -import de.anomic.document.Classification; import de.anomic.document.Parser; import de.anomic.document.ParserException; import de.anomic.document.Word; @@ -513,18 +512,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitchpublic static final String PROXY_ONLINE_CAUTION_DELAY = "onlineCautionDelay"

*

Name of the setting that specifies how long, in milliseconds, indexing should pause after the proxy was last used

diff --git a/source/de/anomic/search/SnippetCache.java b/source/de/anomic/search/SnippetCache.java index fff783ec2..489ca74fd 100644 --- a/source/de/anomic/search/SnippetCache.java +++ b/source/de/anomic/search/SnippetCache.java @@ -39,7 +39,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import de.anomic.document.Condenser; -import de.anomic.document.Classification; import de.anomic.document.Parser; import de.anomic.document.ParserException; import de.anomic.document.Word; @@ -867,29 +866,13 @@ public class SnippetCache { // STEP 3: if the metadata is still null try to guess the mimeType of the resource if (responseHeader == null) { - final String filename = url.getFileName(); - final int p = filename.lastIndexOf('.'); - if ( // if no extension is available - (p < 0) || - // or the extension is supported by one of the parsers - ((p >= 0) && (Classification.supportedFileExtContains(filename.substring(p + 1)))) - ) { - String supposedMime = "text/html"; - - // if the mimeType Parser is installed we can set the mimeType to null to force - // a mimetype detection - if (Classification.supportedMimeTypesContains("application/octet-stream")) { - supposedMime = null; - } else if (p != -1){ - // otherwise we try to determine the mimeType per file Extension - supposedMime = Classification.getMimeTypeByFileExt(filename.substring(p + 1)); - } - + if (Parser.supportsExtension(url)) { + String supposedMime = Parser.mimeOf(url); return Parser.parseSource(url, supposedMime, null, contentLength, resourceStream); } return null; } - if (Classification.supportedMimeTypesContains(responseHeader.mime())) { + if (Parser.supportsMime(responseHeader.mime())) { return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream); } return null; diff --git a/source/de/anomic/tools/mediawikiIndex.java b/source/de/anomic/tools/mediawikiIndex.java index 8bad45c78..b248facf8 100644 --- 
a/source/de/anomic/tools/mediawikiIndex.java +++ b/source/de/anomic/tools/mediawikiIndex.java @@ -58,7 +58,6 @@ import java.util.concurrent.TimeoutException; import de.anomic.data.wiki.wikiCode; import de.anomic.data.wiki.wikiParser; -import de.anomic.document.Classification; import de.anomic.document.Parser; import de.anomic.document.ParserException; import de.anomic.document.Document; @@ -102,9 +101,6 @@ public class mediawikiIndex extends Thread { this.wparser = new wikiCode(new URL(baseURL).getHost()); this.count = 0; this.start = 0; - // must be called before usage: - Classification.initHTMLParsableMimeTypes("text/html"); - Classification.addParseableMimeTypes("text/html"); } /** @@ -146,8 +142,6 @@ public class mediawikiIndex extends Thread { StringBuilder sb = new StringBuilder(); boolean page = false, text = false; String title = null; - Classification.initHTMLParsableMimeTypes("text/html"); - Classification.addParseableMimeTypes("text/html"); wikiparserrecord poison = newRecord(); int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1); BlockingQueue in = new ArrayBlockingQueue(threads * 10); diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java index 188582ea8..623343a6a 100644 --- a/source/de/anomic/yacy/yacyURL.java +++ b/source/de/anomic/yacy/yacyURL.java @@ -528,6 +528,13 @@ public class yacyURL implements Serializable { return path.substring(p + 1); // the 'real' file name } + public String getFileExtension() { + String name = getFileName(); + int p = name.lastIndexOf('.'); + if (p < 0) return ""; + return name.substring(p + 1); + } + public String getPath() { return path; }