diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index d35ba6609..027f5a644 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -37,7 +37,7 @@ import java.util.StringTokenizer; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import de.anomic.document.ParserDispatcher; +import de.anomic.document.Classification; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpd; @@ -469,7 +469,7 @@ public class SettingsAck_p { int enabledMimesCount = 0; final StringBuilder currEnabledMimesTxt = new StringBuilder(); - final String[] enabledMimes = ParserDispatcher.setEnabledParserList(newConfig); + final String[] enabledMimes = Classification.setEnabledParserList(newConfig); Arrays.sort(enabledMimes); currEnabledMimesTxt.setLength(0); diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java index 4631c6aee..40857bf38 100644 --- a/htroot/Settings_p.java +++ b/htroot/Settings_p.java @@ -28,8 +28,9 @@ import java.util.Enumeration; import java.util.HashMap; import java.util.Iterator; +import de.anomic.document.Classification; +import de.anomic.document.Idiom; import de.anomic.document.Parser; -import de.anomic.document.ParserDispatcher; import de.anomic.http.httpHeader; import de.anomic.http.httpRequestHeader; import de.anomic.plasma.plasmaSwitchboard; @@ -218,9 +219,9 @@ public final class Settings_p { */ int parserIdx = 0; - final Iterator availableParserIter = ParserDispatcher.availableParserList.values().iterator(); + final Iterator availableParserIter = Parser.availableParserList.values().iterator(); while (availableParserIter.hasNext()) { - final Parser parserInfo = availableParserIter.next(); + final Idiom parserInfo = availableParserIter.next(); prop.put("parser_" + parserIdx + "_name", parserInfo.getName()); int mimeIdx = 0; @@ -228,7 +229,7 @@ public final class Settings_p { while (mimeTypeIter.hasMoreElements()) { final String mimeType = mimeTypeIter.nextElement(); prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_mimetype", mimeType); - prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (ParserDispatcher.supportedMimeTypesContains(mimeType)) ? 1 : 0); + prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", (Classification.supportedMimeTypesContains(mimeType)) ? 1 : 0); mimeIdx++; } prop.put("parser_" + parserIdx + "_mime", mimeIdx); diff --git a/source/de/anomic/crawler/FTPLoader.java b/source/de/anomic/crawler/FTPLoader.java index 0bcd1c22e..d32d4d663 100644 --- a/source/de/anomic/crawler/FTPLoader.java +++ b/source/de/anomic/crawler/FTPLoader.java @@ -32,7 +32,7 @@ import java.io.IOException; import java.io.PrintStream; import java.util.Date; -import de.anomic.document.ParserDispatcher; +import de.anomic.document.Classification; import de.anomic.http.httpHeader; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; @@ -218,14 +218,14 @@ public class FTPLoader { private httpDocument getFile(final ftpc ftpClient, final CrawlEntry entry) throws Exception { // determine the mimetype of the resource final yacyURL entryUrl = entry.url(); - final String extension = ParserDispatcher.getFileExt(entryUrl); - final String mimeType = ParserDispatcher.getMimeTypeByFileExt(extension); + final String extension = Classification.getFileExt(entryUrl); + final String mimeType = Classification.getMimeTypeByFileExt(extension); final String path = getPath(entryUrl); // if the mimetype and file extension is supported we start to download // the file httpDocument htCache = null; - if (ParserDispatcher.supportedContent(entryUrl, mimeType)) { + if (Classification.supportedContent(entryUrl, mimeType)) { // aborting download if content is too long final int size = ftpClient.fileSize(path); if (size <= maxFileSize || maxFileSize == -1) { diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index 8b07d17a6..55dbd60d4 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -29,7 +29,7 @@ import java.io.IOException; import java.util.Date; import de.anomic.data.Blacklist; -import de.anomic.document.ParserDispatcher; +import de.anomic.document.Classification; import de.anomic.http.httpClient; import de.anomic.http.httpHeader; import de.anomic.http.httpResponse; @@ -156,7 +156,7 @@ public final class HTTPLoader { // request has been placed and result has been returned. work off response //try { - if (ParserDispatcher.supportedContent(entry.url(), res.getResponseHeader().mime())) { + if (Classification.supportedContent(entry.url(), res.getResponseHeader().mime())) { // get the content length and check if the length is allowed long contentLength = res.getResponseHeader().getContentLength(); diff --git a/source/de/anomic/document/AbstractParser.java b/source/de/anomic/document/AbstractParser.java index d9948f8a7..cf79945e9 100644 --- a/source/de/anomic/document/AbstractParser.java +++ b/source/de/anomic/document/AbstractParser.java @@ -38,12 +38,12 @@ import de.anomic.yacy.yacyURL; import de.anomic.yacy.logging.Log; /** - * New classes implementing the {@link de.anomic.document.Parser} interface + * New classes implementing the {@link de.anomic.document.Idiom} interface * can extend this class to inherit all functions already implemented in this class. * @author Martin Thelian * @version $LastChangedRevision$ / $LastChangedDate$ */ -public abstract class AbstractParser implements Parser { +public abstract class AbstractParser implements Idiom { /** * the logger class that should be used by the parser module for logging @@ -125,9 +125,9 @@ public abstract class AbstractParser implements Parser { // XXX: workaround for relative paths within document + file.getPath().substring(file.getPath().indexOf(File.separatorChar) + 1) + "/" + file.getName()); - final Document subdoc = ParserDispatcher.parseSource( + final Document subdoc = Parser.parseSource( url, - ParserDispatcher.getMimeTypeByFileExt(files[i].substring(files[i].indexOf('.') + 1)), + Classification.getMimeTypeByFileExt(files[i].substring(files[i].indexOf('.') + 1)), null, file); // TODO: change anchors back to use '#' after archive name doc.addSubDocument(subdoc); @@ -150,7 +150,7 @@ public abstract class AbstractParser implements Parser { * and some additional metadata. * @throws ParserException if the content could not be parsed properly * - * @see de.anomic.document.Parser#parse(de.anomic.net.URL, java.lang.String, byte[]) + * @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, byte[]) */ public Document parse( final yacyURL location, @@ -185,7 +185,7 @@ public abstract class AbstractParser implements Parser { * and some additional metadata. * @throws ParserException if the content could not be parsed properly * - * @see de.anomic.document.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.File) + * @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.File) */ public Document parse( final yacyURL location, @@ -220,7 +220,7 @@ public abstract class AbstractParser implements Parser { * and some additional metadata. * @throws ParserException if the content could not be parsed properly * - * @see de.anomic.document.Parser#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream) + * @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream) */ public abstract Document parse(yacyURL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException; diff --git a/source/de/anomic/document/Classification.java b/source/de/anomic/document/Classification.java new file mode 100644 index 000000000..59c662490 --- /dev/null +++ b/source/de/anomic/document/Classification.java @@ -0,0 +1,288 @@ +// Classification.java +// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 09.07.2009 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $ +// $LastChangedRevision: 5736 $ +// $LastChangedBy: borg-0300 $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.document; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Properties; +import java.util.Set; + +import de.anomic.yacy.yacyURL; +import de.anomic.yacy.logging.Log; + +public class Classification { + + public static final HashSet supportedHTMLFileExt = new HashSet(); + public static final HashSet supportedHTMLMimeTypes = new HashSet(); + + private static final HashSet mediaExtSet = new HashSet(); + private static final HashSet imageExtSet = new HashSet(); + private static final HashSet audioExtSet = new HashSet(); + private static final HashSet videoExtSet = new HashSet(); + private static final HashSet appsExtSet = new HashSet(); + private static final Properties mimeTypeLookupByFileExt = new Properties(); + + public final static HashSet enabledParserList = new HashSet(); + private final static HashSet supportedFileExt = new HashSet(); + + static { + // load a list of extensions from file + BufferedInputStream bufferedIn = null; + try { + mimeTypeLookupByFileExt.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime")))); + } catch (final IOException e) { + System.err.println("ERROR: httpd.mime not found in settings path"); + } finally { + if (bufferedIn != null) try { + bufferedIn.close(); + } catch (final Exception e) {} + } + + final String apps = "sit,hqx,img,dmg,exe,com,bat,sh,vbs,zip,jar"; + final String audio = "mp2,mp3,ogg,aac,aif,aiff,wav"; + final String video = "swf,avi,wmv,rm,mov,mpg,mpeg,ram,m4v"; + final String image = "jpg,jpeg,jpe,gif,png,ico,bmp"; + + imageExtSet.addAll(extString2extList(image)); // image formats + audioExtSet.addAll(extString2extList(audio)); // audio formats + videoExtSet.addAll(extString2extList(video)); // video formats + appsExtSet.addAll(extString2extList(apps)); // application formats + + initMediaExt(extString2extList(apps + "," + // application container + "tar,gz,bz2,arj,zip,rar," + // archive formats + "ps,xls,ppt,asf," + // text formats without support + audio + "," + // audio formats + video + "," + // video formats + image // image formats + )); + } + + public static List extString2extList(final String extString) { + final LinkedList extensions = new LinkedList(); + if ((extString == null) || (extString.length() == 0)) { + return extensions; + } + final String[] xs = extString.split(","); + for (int i = 0; i < xs.length; i++) + extensions.add(xs[i].toLowerCase().trim()); + return extensions; + } + + public static void initMediaExt(final List mediaExtList) { + mediaExtSet.addAll(mediaExtList); + } + + public static boolean mediaExtContains(String mediaExt) { + if (mediaExt == null) return false; + mediaExt = mediaExt.trim().toLowerCase(); + + if (supportedHTMLFileExt.contains(mediaExt)) return false; + + if (supportedFileExtContains(mediaExt)) return false; + + return mediaExtSet.contains(mediaExt); + } + + public static boolean imageExtContains(final String imageExt) { + if (imageExt == null) return false; + return imageExtSet.contains(imageExt.trim().toLowerCase()); + } + + public static boolean audioExtContains(final String audioExt) { + if (audioExt == null) return false; + return audioExtSet.contains(audioExt.trim().toLowerCase()); + } + + public static boolean videoExtContains(final String videoExt) { + if (videoExt == null) return false; + return videoExtSet.contains(videoExt.trim().toLowerCase()); + } + + public static boolean appsExtContains(final String appsExt) { + if (appsExt == null) return false; + return appsExtSet.contains(appsExt.trim().toLowerCase()); + } + + public static void initHTMLParsableMimeTypes( + final String htmlParsableMimeTypes) { + final LinkedList mimeTypes = new LinkedList(); + if ((htmlParsableMimeTypes == null) || (htmlParsableMimeTypes.length() == 0)) { + return; + } + final String[] realtimeParsableMimeTypeList = htmlParsableMimeTypes + .split(","); + for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) { + mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase().trim()); + } + supportedHTMLMimeTypes.addAll(mimeTypes); + } + + public static String normalizeMimeType(String mimeType) { + // if (mimeType == null) doMimeTypeAnalysis + if (mimeType == null) mimeType = "application/octet-stream"; + mimeType = mimeType.trim().toLowerCase(); + + final int pos = mimeType.indexOf(';'); + return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); + } + + public static String getMimeTypeByFileExt(final String fileExt) { + return mimeTypeLookupByFileExt.getProperty(fileExt, "application/octet-stream"); + } + + public static void initSupportedHTMLFileExt(final List supportedRealtimeFileExtList) { + supportedHTMLFileExt.addAll(supportedRealtimeFileExtList); + } + + static boolean HTMLParsableMimeTypesContains(String mimeType) { + mimeType = normalizeMimeType(mimeType); + return supportedHTMLMimeTypes.contains(mimeType); + } + + public static boolean supportedContent(final yacyURL url, String mimeType) { + mimeType = Classification.normalizeMimeType(mimeType); + if ( + mimeType.equals("text/html") || + mimeType.equals("application/xhtml+xml") || + mimeType.equals("text/plain") + ) { + return supportedMimeTypesContains(mimeType); + } + return supportedMimeTypesContains(mimeType) && supportedFileExt(url); + } + + public static boolean supportedMimeTypesContains(String mimeType) { + mimeType = Classification.normalizeMimeType(mimeType); + + if (Classification.supportedHTMLMimeTypes.contains(mimeType)) return true; + return enabledParserList.contains(mimeType); + } + + private static boolean supportedFileExt(final yacyURL url) { + if (url == null) throw new NullPointerException(); + + // getting the file path + final String name = getFileExt(url); + return supportedFileExtContains(name); + } + + public static boolean supportedFileExtContains(String fileExt) { + if (fileExt == null) return false; + fileExt = fileExt.trim().toLowerCase(); + if (Classification.supportedHTMLFileExt.contains(fileExt)) return true; + + return supportedFileExt.contains(fileExt); + } + + public static void addParseableMimeTypes(final String enabledMimeTypes) { + HashSet mimeTypes = null; + if ((enabledMimeTypes == null) || (enabledMimeTypes.length() == 0)) { + mimeTypes = new HashSet(); + } else { + final String[] enabledMimeTypeList = enabledMimeTypes.split(","); + mimeTypes = new HashSet(enabledMimeTypeList.length); + for (int i = 0; i < enabledMimeTypeList.length; i++) mimeTypes.add(enabledMimeTypeList[i].toLowerCase().trim()); + } + setEnabledParserList(mimeTypes); + } + + public static void enableAllParsers() { + final Set availableMimeTypes = Parser.availableParserList.keySet(); + setEnabledParserList(availableMimeTypes); + } + + public static String[] setEnabledParserList(final Set mimeTypeSet) { + + final HashSet newEnabledParsers = new HashSet(); + final HashSet newSupportedFileExt = new HashSet(); + + if (mimeTypeSet != null) { + final Iterator mimeTypes = mimeTypeSet.iterator(); + while (mimeTypes.hasNext()) { + final String mimeType = mimeTypes.next(); + Idiom theParser = Parser.availableParserList.get(mimeType); + if (theParser != null) { + try { + // getting a list of mimeTypes that the parser supports + final Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes(); + if (parserSupportsMimeTypes != null) { + final Object supportedExtensions = parserSupportsMimeTypes.get(mimeType); + if ((supportedExtensions != null) && + (supportedExtensions instanceof String) && + (((String)supportedExtensions).length() > 0)) { + final String[] extArray = ((String)supportedExtensions).split(","); + newSupportedFileExt.addAll(Arrays.asList(extArray)); + } + } + newEnabledParsers.add(mimeType); + + } catch (final Exception e) { + Log.logSevere("PARSER", "error in setEnabledParserList", e); + } finally { + if (theParser != null) + theParser = null; // destroy object + } + } + } + } + + enabledParserList.addAll(newEnabledParsers); + supportedFileExt.addAll(newSupportedFileExt); + + return newEnabledParsers.toArray(new String[newEnabledParsers.size()]); + } + + @SuppressWarnings("unchecked") + public static HashSet getEnabledParserList() { + return (HashSet) enabledParserList.clone(); + } + + public static String getFileExt(final yacyURL url) { + // getting the file path + String name = url.getPath(); + + // tetermining last position of / in the file path + int p = name.lastIndexOf('/'); + if (p != -1) { + name = name.substring(p); + } + + // termining last position of . in file path + p = name.lastIndexOf('.'); + if (p < 0) + return ""; + return name.substring(p + 1); + } +} diff --git a/source/de/anomic/document/Condenser.java b/source/de/anomic/document/Condenser.java index b13467d64..45f60ec4c 100644 --- a/source/de/anomic/document/Condenser.java +++ b/source/de/anomic/document/Condenser.java @@ -33,7 +33,6 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.RandomAccessFile; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.util.Enumeration; @@ -295,33 +294,14 @@ public final class Condenser { int idx; int wordInSentenceCounter = 1; boolean comb_indexof = false, last_last = false, last_index = false; - RandomAccessFile fa; - final boolean dumpWords = false; final HashMap sentences = new HashMap(); - if (dumpWords) try { - fa = new RandomAccessFile(new File("dump.txt"), "rw"); - fa.seek(fa.length()); - } catch (final IOException e) { - e.printStackTrace(); - fa = null; - } - // read source final sievedWordsEnum wordenum = new sievedWordsEnum(is); while (wordenum.hasMoreElements()) { word = (new String(wordenum.nextElement())).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars? if (languageIdentificator != null) languageIdentificator.add(word); if (word.length() < wordminsize) continue; - //System.out.println("PARSED-WORD " + word); - - //This is useful for testing what YaCy "sees" of a website. - if (dumpWords && fa != null) try { - fa.writeBytes(word); - fa.write(160); - } catch (final IOException e) { - e.printStackTrace(); - } // distinguish punctuation and words wordlen = word.length(); @@ -397,15 +377,6 @@ public final class Condenser { sentences.put(sentence, new Phrase(sentenceHandleCount++)); } } - - if (dumpWords && fa != null) try { - fa.write('\n'); - fa.close(); - } catch (final IOException e) { - e.printStackTrace(); - } - - // ------------------- // we reconstruct the sentence hashtable // and order the entries by the number of the sentence diff --git a/source/de/anomic/document/Document.java b/source/de/anomic/document/Document.java index 5274da9f4..f991befaf 100644 --- a/source/de/anomic/document/Document.java +++ b/source/de/anomic/document/Document.java @@ -30,9 +30,12 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; +import java.net.MalformedURLException; import java.util.Arrays; +import java.util.Collection; import java.util.Date; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; @@ -97,7 +100,7 @@ public class Document { this.languages = languages; if (text == null) try { - this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE); + this.text = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); } catch (final IOException e) { e.printStackTrace(); this.text = new StringBuilder(); @@ -371,14 +374,14 @@ dc_rights } else { ext = u.substring(extpos + 1).toLowerCase(); } - if (ParserDispatcher.mediaExtContains(ext)) { + if (Classification.mediaExtContains(ext)) { // this is not a normal anchor, its a media link - if (ParserDispatcher.imageExtContains(ext)) { + if (Classification.imageExtContains(ext)) { ContentScraper.addImage(collectedImages, new ImageEntry(url, entry.getValue(), -1, -1)); } - else if (ParserDispatcher.audioExtContains(ext)) audiolinks.put(url, entry.getValue()); - else if (ParserDispatcher.videoExtContains(ext)) videolinks.put(url, entry.getValue()); - else if (ParserDispatcher.appsExtContains(ext)) applinks.put(url, entry.getValue()); + else if (Classification.audioExtContains(ext)) audiolinks.put(url, entry.getValue()); + else if (Classification.videoExtContains(ext)) videolinks.put(url, entry.getValue()); + else if (Classification.appsExtContains(ext)) applinks.put(url, entry.getValue()); } else { hyperlinks.put(url, entry.getValue()); } @@ -396,21 +399,117 @@ dc_rights // we add artificial hyperlinks to the hyperlink set // that can be calculated from given hyperlinks and imagelinks - hyperlinks.putAll(ParserDispatcher.allReflinks(images.values())); - hyperlinks.putAll(ParserDispatcher.allReflinks(audiolinks.keySet())); - hyperlinks.putAll(ParserDispatcher.allReflinks(videolinks.keySet())); - hyperlinks.putAll(ParserDispatcher.allReflinks(applinks.keySet())); + hyperlinks.putAll(allReflinks(images.values())); + hyperlinks.putAll(allReflinks(audiolinks.keySet())); + hyperlinks.putAll(allReflinks(videolinks.keySet())); + hyperlinks.putAll(allReflinks(applinks.keySet())); /* - hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet())); - hyperlinks.putAll(plasmaParser.allSubpaths(images.values())); - hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet())); - hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet())); - hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet())); + hyperlinks.putAll(allSubpaths(hyperlinks.keySet())); + hyperlinks.putAll(allSubpaths(images.values())); + hyperlinks.putAll(allSubpaths(audiolinks.keySet())); + hyperlinks.putAll(allSubpaths(videolinks.keySet())); + hyperlinks.putAll(allSubpaths(applinks.keySet())); */ // don't do this again this.resorted = true; } + public static Map allSubpaths(final Collection links) { + // links is either a Set of Strings (urls) or a Set of + // htmlFilterImageEntries + final HashSet h = new HashSet(); + Iterator i = links.iterator(); + Object o; + yacyURL url; + String u; + int pos; + int l; + while (i.hasNext()) + try { + o = i.next(); + if (o instanceof yacyURL) url = (yacyURL) o; + else if (o instanceof String) url = new yacyURL((String) o, null); + else if (o instanceof ImageEntry) url = ((ImageEntry) o).url(); + else { + assert false; + continue; + } + u = url.toNormalform(true, true); + if (u.endsWith("/")) + u = u.substring(0, u.length() - 1); + pos = u.lastIndexOf('/'); + while (pos > 8) { + l = u.length(); + u = u.substring(0, pos + 1); + h.add(u); + u = u.substring(0, pos); + assert (u.length() < l) : "u = " + u; + pos = u.lastIndexOf('/'); + } + } catch (final MalformedURLException e) { } + // now convert the strings to yacyURLs + i = h.iterator(); + final HashMap v = new HashMap(); + while (i.hasNext()) { + u = (String) i.next(); + try { + url = new yacyURL(u, null); + v.put(url, "sub"); + } catch (final MalformedURLException e) { + } + } + return v; + } + + public static Map allReflinks(final Collection links) { + // links is either a Set of Strings (with urls) or + // htmlFilterImageEntries + // we find all links that are part of a reference inside a url + final HashMap v = new HashMap(); + final Iterator i = links.iterator(); + Object o; + yacyURL url; + String u; + int pos; + loop: while (i.hasNext()) + try { + o = i.next(); + if (o instanceof yacyURL) + url = (yacyURL) o; + else if (o instanceof String) + url = new yacyURL((String) o, null); + else if (o instanceof ImageEntry) + url = ((ImageEntry) o).url(); + else { + assert false; + continue; + } + u = url.toNormalform(true, true); + if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) { + i.remove(); + u = u.substring(pos); + while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) + u = u.substring(pos); + url = new yacyURL(u, null); + if (!(v.containsKey(url))) + v.put(url, "ref"); + continue loop; + } + if ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) { + i.remove(); + u = "http:/" + u.substring(pos); + while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) + u = "http:/" + u.substring(pos); + url = new yacyURL(u, null); + if (!(v.containsKey(url))) + v.put(url, "ref"); + continue loop; + } + } catch (final MalformedURLException e) { + } + return v; + } + public void addSubDocument(final Document doc) throws IOException { this.sections.addAll(Arrays.asList(doc.getSectionTitles())); @@ -423,7 +522,7 @@ dc_rights this.description.append(doc.dc_description()); if (!(this.text instanceof serverCachedFileOutputStream)) { - this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE); + this.text = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); FileUtils.copy(getText(), (serverCachedFileOutputStream)this.text); } FileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text); diff --git a/source/de/anomic/document/Idiom.java b/source/de/anomic/document/Idiom.java new file mode 100644 index 000000000..64cd92617 --- /dev/null +++ b/source/de/anomic/document/Idiom.java @@ -0,0 +1,108 @@ +//Idiom.java +//------------------------ +//part of YaCy +//(C) by Michael Peter Christen; mc@yacy.net +//first published on http://www.anomic.de +//Frankfurt, Germany, 2005 +// +//this file was contributed by Martin Thelian +//last major change: $LastChangedDate$ by $LastChangedBy$ +//Revision: $LastChangedRevision$ +// +//This program is free software; you can redistribute it and/or modify +//it under the terms of the GNU General Public License as published by +//the Free Software Foundation; either version 2 of the License, or +//(at your option) any later version. +// +//This program is distributed in the hope that it will be useful, +//but WITHOUT ANY WARRANTY; without even the implied warranty of +//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +//GNU General Public License for more details. +// +//You should have received a copy of the GNU General Public License +//along with this program; if not, write to the Free Software +//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.document; + +import java.io.File; +import java.io.InputStream; +import java.util.Hashtable; + +import de.anomic.yacy.yacyURL; + +/** + * This interface defines a list of methods that needs to be implemented + * by each content parser class. + * @author Martin Thelian + * @version $LastChangedRevision$ / $LastChangedDate$ + */ +public interface Idiom { + + + public static long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024; + + /** + * Parsing a document available as byte array + * @param location the origin of the document + * @param mimeType the mimetype of the document + * @param charset the supposed charset of the document or null if unkown + * @param source the content byte array + * @return a {@link Document} containing the extracted plain text of the document + * and some additional metadata. + * + * @throws ParserException if the content could not be parsed properly + */ + public Document parse(yacyURL location, String mimeType, String charset, byte[] source) + throws ParserException, InterruptedException; + + /** + * Parsing a document stored in a {@link File} + * @param location the origin of the document + * @param mimeType the mimetype of the document + * @param charset the supposed charset of the document or null if unkown + * @param sourceFile the file containing the content of the document + * @return a {@link Document} containing the extracted plain text of the document + * and some additional metadata. + * + * @throws ParserException if the content could not be parsed properly + */ + public Document parse(yacyURL location, String mimeType, String charset, File sourceFile) + throws ParserException, InterruptedException; + + /** + * Parsing a document available as {@link InputStream} + * @param location the origin of the document + * @param mimeType the mimetype of the document + * @param charset the supposed charset of the document or null if unkown + * @param source the {@link InputStream} containing the document content + * @return a {@link Document} containing the extracted plain text of the document + * and some additional metadata. + * + * @throws ParserException if the content could not be parsed properly + */ + public Document parse(yacyURL location, String mimeType, String charset, InputStream source) + throws ParserException, InterruptedException; + + /** + * Can be used to determine the MimeType(s) that are supported by the parser + * @return a {@link Hashtable} containing a list of MimeTypes that are supported by + * the parser + */ + public Hashtable getSupportedMimeTypes(); + + /** + * This function should be called before reusing the parser object. + */ + public void reset(); + + public void setContentLength(long length); + + /** + * Returns the name of the parser + * @return parser name + */ + public String getName(); +} + + diff --git a/source/de/anomic/document/Parser.java b/source/de/anomic/document/Parser.java index 50658c8b4..579a05a9e 100644 --- a/source/de/anomic/document/Parser.java +++ b/source/de/anomic/document/Parser.java @@ -1,108 +1,190 @@ -//Parser.java -//------------------------ -//part of YaCy -//(C) by Michael Peter Christen; mc@yacy.net -//first published on http://www.anomic.de -//Frankfurt, Germany, 2005 -// -//this file was contributed by Martin Thelian -//last major change: $LastChangedDate$ by $LastChangedBy$ -//Revision: $LastChangedRevision$ -// -//This program is free software; you can redistribute it and/or modify -//it under the terms of the GNU General Public License as published by -//the Free Software Foundation; either version 2 of the License, or -//(at your option) any later version. -// -//This program is distributed in the hope that it will be useful, -//but WITHOUT ANY WARRANTY; without even the implied warranty of -//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//GNU General Public License for more details. -// -//You should have received a copy of the GNU General Public License -//along with this program; if not, write to the Free Software -//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.document; - -import java.io.File; -import java.io.InputStream; -import java.util.Hashtable; - -import de.anomic.yacy.yacyURL; - -/** - * This interface defines a list of methods that needs to be implemented - * by each content parser class. - * @author Martin Thelian - * @version $LastChangedRevision$ / $LastChangedDate$ - */ -public interface Parser { - - - public static long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024; - - /** - * Parsing a document available as byte array - * @param location the origin of the document - * @param mimeType the mimetype of the document - * @param charset the supposed charset of the document or null if unkown - * @param source the content byte array - * @return a {@link Document} containing the extracted plain text of the document - * and some additional metadata. - * - * @throws ParserException if the content could not be parsed properly - */ - public Document parse(yacyURL location, String mimeType, String charset, byte[] source) - throws ParserException, InterruptedException; - - /** - * Parsing a document stored in a {@link File} - * @param location the origin of the document - * @param mimeType the mimetype of the document - * @param charset the supposed charset of the document or null if unkown - * @param sourceFile the file containing the content of the document - * @return a {@link Document} containing the extracted plain text of the document - * and some additional metadata. - * - * @throws ParserException if the content could not be parsed properly - */ - public Document parse(yacyURL location, String mimeType, String charset, File sourceFile) - throws ParserException, InterruptedException; - - /** - * Parsing a document available as {@link InputStream} - * @param location the origin of the document - * @param mimeType the mimetype of the document - * @param charset the supposed charset of the document or null if unkown - * @param source the {@link InputStream} containing the document content - * @return a {@link Document} containing the extracted plain text of the document - * and some additional metadata. - * - * @throws ParserException if the content could not be parsed properly - */ - public Document parse(yacyURL location, String mimeType, String charset, InputStream source) - throws ParserException, InterruptedException; - - /** - * Can be used to determine the MimeType(s) that are supported by the parser - * @return a {@link Hashtable} containing a list of MimeTypes that are supported by - * the parser - */ - public Hashtable getSupportedMimeTypes(); - - /** - * This function should be called before reusing the parser object. - */ - public void reset(); - - public void setContentLength(long length); - - /** - * Returns the name of the parser - * @return parser name - */ - public String getName(); -} - - +// Parser.java +// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 09.07.2009 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $ +// $LastChangedRevision: 5736 $ +// $LastChangedBy: borg-0300 $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.document; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Hashtable; +import java.util.Iterator; + +import de.anomic.document.parser.bzipParser; +import de.anomic.document.parser.docParser; +import de.anomic.document.parser.gzipParser; +import de.anomic.document.parser.htmlParser; +import de.anomic.document.parser.mimeTypeParser; +import de.anomic.document.parser.odtParser; +import de.anomic.document.parser.pdfParser; +import de.anomic.document.parser.pptParser; +import de.anomic.document.parser.psParser; +import de.anomic.document.parser.rpmParser; +import de.anomic.document.parser.rssParser; +import de.anomic.document.parser.rtfParser; +import de.anomic.document.parser.sevenzipParser; +import de.anomic.document.parser.swfParser; +import de.anomic.document.parser.tarParser; +import de.anomic.document.parser.vcfParser; +import de.anomic.document.parser.vsdParser; +import de.anomic.document.parser.xlsParser; +import de.anomic.document.parser.zipParser; +import de.anomic.yacy.yacyURL; +import de.anomic.yacy.logging.Log; + +public final class Parser { + + private static final Log theLogger = new Log("PARSER"); + public static final HashMap availableParserList = new HashMap(); + + static { + initParser(new bzipParser()); + initParser(new docParser()); + initParser(new gzipParser()); + initParser(new mimeTypeParser()); + initParser(new odtParser()); + initParser(new pdfParser()); + initParser(new pptParser()); + initParser(new psParser()); + initParser(new rpmParser()); + initParser(new rssParser()); + initParser(new rtfParser()); + initParser(new sevenzipParser()); + initParser(new swfParser()); + initParser(new tarParser()); + initParser(new vcfParser()); + initParser(new vsdParser()); + initParser(new xlsParser()); + initParser(new zipParser()); + } + + private static void initParser(Idiom theParser) { + final Hashtable supportedMimeTypes = theParser.getSupportedMimeTypes(); + final Iterator mimeTypeIterator = supportedMimeTypes.keySet().iterator(); + while (mimeTypeIterator.hasNext()) { + final String mimeType = mimeTypeIterator.next(); + availableParserList.put(mimeType, theParser); + Log.logInfo("PARSER", "Found parser for mimeType '" + mimeType + "': " + theParser.getName()); + } + } + + public static Document parseSource(final yacyURL location, + final String mimeType, final String charset, + final byte[] sourceArray) throws InterruptedException, + ParserException { + ByteArrayInputStream byteIn = null; + try { + if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from byte-array"); + if (sourceArray == null || sourceArray.length == 0) { + final String errorMsg = "No resource content available (1) " + (((sourceArray == null) ? "source == null" : "source.length() == 0") + ", url = " + location.toNormalform(true, false)); + theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg, location, errorMsg); + } + byteIn = new ByteArrayInputStream(sourceArray); + return parseSource(location, mimeType, charset, sourceArray.length, byteIn); + } catch (final Exception e) { + if (e instanceof InterruptedException) throw (InterruptedException) e; + if (e instanceof ParserException) throw (ParserException) e; + theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e); + throw new ParserException("Unexpected exception while parsing " + location, location, e); + } finally { + if (byteIn != null) try { + byteIn.close(); + } catch (final Exception ex) { } + } + } + + public static Document parseSource(final yacyURL location, + final String mimeType, final String charset, + final File sourceFile) throws InterruptedException, ParserException { + + BufferedInputStream sourceStream = null; + try { + if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from file"); + if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { + final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2)."; + theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg, location, "document has no content"); + } + sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); + return parseSource(location, mimeType, charset, sourceFile.length(), sourceStream); + } catch (final Exception e) { + if (e instanceof InterruptedException) throw (InterruptedException) e; + if (e instanceof ParserException) throw (ParserException) e; + theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e); + throw new ParserException("Unexpected exception while parsing " + location, location, e); + } finally { + if (sourceStream != null)try { + sourceStream.close(); + } catch (final Exception ex) {} + } + } + + public static Document parseSource(final yacyURL location, + String mimeType, final String charset, + final long contentLength, final InputStream sourceStream) + throws InterruptedException, ParserException { + try { + if (theLogger.isFine()) theLogger.logFine("Parsing '" + location + "' from stream"); + mimeType = Classification.normalizeMimeType(mimeType); + final String fileExt = Classification.getFileExt(location); + final String documentCharset = htmlParser.patchCharsetEncoding(charset); + if (!Classification.supportedContent(location, mimeType)) { + final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (1)"; + theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg, location, "wrong mime type or wrong extension"); + } + if (theLogger.isFine()) theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); + Idiom parser = availableParserList.get(Classification.normalizeMimeType(mimeType)); + Document doc = null; + if (parser != null) { + parser.setContentLength(contentLength); + doc = parser.parse(location, mimeType, documentCharset, sourceStream); + } else if (Classification.HTMLParsableMimeTypesContains(mimeType)) { + doc = new htmlParser().parse(location, mimeType, documentCharset, sourceStream); + } else { + final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)"; + theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg, location, "wrong mime type or wrong extension"); + } + if (doc == null) { + final String errorMsg = "Unexpected error. Parser returned null."; + theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg, location); + } + return doc; + } catch (final Exception e) { + if (e instanceof InterruptedException) throw (InterruptedException) e; + if (e instanceof ParserException) throw (ParserException) e; + final String errorMsg = "Unexpected exception. " + e.getMessage(); + theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e); + throw new ParserException(errorMsg, location, e); + } + } + +} diff --git a/source/de/anomic/document/ParserConfig.java b/source/de/anomic/document/ParserConfig.java deleted file mode 100644 index 38bcdb059..000000000 --- a/source/de/anomic/document/ParserConfig.java +++ /dev/null @@ -1,174 +0,0 @@ -// plasmaParserConfig.java -// ------------------------------------- -// part of YACY -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// -// This file ist contributed by Martin Thelian -// -// $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $ -// $LastChangedRevision: 1715 $ -// $LastChangedBy: theli $ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.document; - -import java.util.Arrays; -import java.util.HashSet; -import java.util.Hashtable; -import java.util.Iterator; -import java.util.Set; - -import de.anomic.yacy.yacyURL; -import de.anomic.yacy.logging.Log; - -public class ParserConfig { - - /** - * A list containing all enabled parsers and the mimeType that they can handle - * @see #loadEnabledParserList() - * @see #setEnabledParserList(Enumeration) - */ - public final HashSet enabledParserList; - - /** - * A list of file extensions that are supported by all enabled parsers - */ - private final HashSet supportedFileExt; - - public ParserConfig() { - supportedFileExt = new HashSet(); - enabledParserList = new HashSet(); - } - - public boolean supportedContent(final yacyURL url, String mimeType) { - // TODO: we need some exceptions here to index URLs like this - // http://www.musicabona.com/respighi/12668/cd/index.html.fr - mimeType = ParserDispatcher.normalizeMimeType(mimeType); - if ( - mimeType.equals("text/html") || - mimeType.equals("application/xhtml+xml") || - mimeType.equals("text/plain") - ) { - return supportedMimeTypesContains(mimeType); - } - return supportedMimeTypesContains(mimeType) && supportedFileExt(url); - } - - public boolean supportedMimeTypesContains(String mimeType) { - mimeType = ParserDispatcher.normalizeMimeType(mimeType); - - synchronized (ParserDispatcher.supportedHTMLMimeTypes) { - if (ParserDispatcher.supportedHTMLMimeTypes.contains(mimeType)) return true; - } - - synchronized (this.enabledParserList) { - return this.enabledParserList.contains(mimeType); - } - } - - private boolean supportedFileExt(final yacyURL url) { - if (url == null) throw new NullPointerException(); - - // getting the file path - final String name = ParserDispatcher.getFileExt(url); - return supportedFileExtContains(name); - } - - public boolean supportedFileExtContains(String fileExt) { - if (fileExt == null) return false; - fileExt = fileExt.trim().toLowerCase(); - - synchronized (ParserDispatcher.supportedHTMLFileExt) { - if (ParserDispatcher.supportedHTMLFileExt.contains(fileExt)) return true; - } - - synchronized(this.supportedFileExt) { - return this.supportedFileExt.contains(fileExt); - } - } - - public void addParseableMimeTypes(final String enabledMimeTypes) { - HashSet mimeTypes = null; - if ((enabledMimeTypes == null) || (enabledMimeTypes.length() == 0)) { - mimeTypes = new HashSet(); - } else { - final String[] enabledMimeTypeList = enabledMimeTypes.split(","); - mimeTypes = new HashSet(enabledMimeTypeList.length); - for (int i = 0; i < enabledMimeTypeList.length; i++) mimeTypes.add(enabledMimeTypeList[i].toLowerCase().trim()); - } - setEnabledParserList(mimeTypes); - } - - public void enableAllParsers() { - final Set availableMimeTypes = ParserDispatcher.availableParserList.keySet(); - setEnabledParserList(availableMimeTypes); - } - - public String[] setEnabledParserList(final Set mimeTypeSet) { - - final HashSet newEnabledParsers = new HashSet(); - final HashSet newSupportedFileExt = new HashSet(); - - if (mimeTypeSet != null) { - final Iterator mimeTypes = mimeTypeSet.iterator(); - while (mimeTypes.hasNext()) { - final String mimeType = mimeTypes.next(); - Parser theParser = ParserDispatcher.availableParserList.get(mimeType); - if (theParser != null) { - try { - // getting a list of mimeTypes that the parser supports - final Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes(); - if (parserSupportsMimeTypes != null) { - final Object supportedExtensions = parserSupportsMimeTypes.get(mimeType); - if ((supportedExtensions != null) && - (supportedExtensions instanceof String) && - (((String)supportedExtensions).length() > 0)) { - final String[] extArray = ((String)supportedExtensions).split(","); - newSupportedFileExt.addAll(Arrays.asList(extArray)); - } - } - newEnabledParsers.add(mimeType); - - } catch (final Exception e) { - Log.logSevere("PARSER", "error in setEnabledParserList", e); - } finally { - if (theParser != null) - theParser = null; // destroy object - } - } - } - } - - synchronized (this.enabledParserList) { - this.enabledParserList.addAll(newEnabledParsers); - } - - synchronized (this.supportedFileExt) { - this.supportedFileExt.addAll(newSupportedFileExt); - } - - return newEnabledParsers.toArray(new String[newEnabledParsers.size()]); - } - - @SuppressWarnings("unchecked") - public HashSet getEnabledParserList() { - synchronized (this.enabledParserList) { - return (HashSet) this.enabledParserList.clone(); - } - } -} \ No newline at end of file diff --git a/source/de/anomic/document/ParserDispatcher.java b/source/de/anomic/document/ParserDispatcher.java deleted file mode 100644 index 70700a208..000000000 --- a/source/de/anomic/document/ParserDispatcher.java +++ /dev/null @@ -1,576 +0,0 @@ - -package de.anomic.document; - -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.MalformedURLException; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Hashtable; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.Set; - -import de.anomic.document.parser.bzipParser; -import de.anomic.document.parser.docParser; -import de.anomic.document.parser.gzipParser; -import de.anomic.document.parser.htmlParser; -import de.anomic.document.parser.mimeTypeParser; -import de.anomic.document.parser.odtParser; -import de.anomic.document.parser.pdfParser; -import de.anomic.document.parser.pptParser; -import de.anomic.document.parser.psParser; -import de.anomic.document.parser.rpmParser; -import de.anomic.document.parser.rssParser; -import de.anomic.document.parser.rtfParser; -import de.anomic.document.parser.sevenzipParser; -import de.anomic.document.parser.swfParser; -import de.anomic.document.parser.tarParser; -import de.anomic.document.parser.vcfParser; -import de.anomic.document.parser.vsdParser; -import de.anomic.document.parser.xlsParser; -import de.anomic.document.parser.zipParser; -import de.anomic.document.parser.html.ImageEntry; -import de.anomic.yacy.yacyURL; -import de.anomic.yacy.logging.Log; - -public final class ParserDispatcher { - - public static final ParserConfig parserConfig = new ParserConfig(); - - /** - * A list containing all installed parsers and the mimeType that they support - * @see #loadAvailableParserList() - */ - public static final HashMap availableParserList = new HashMap(); - - /** - * A list of file extensions and mime types that are supported by the html-parser - */ - public static final HashSet supportedHTMLFileExt = new HashSet(); - public static final HashSet supportedHTMLMimeTypes = new HashSet(); - - private static final Properties mimeTypeLookupByFileExt = new Properties(); - static { - // loading a list of extensions from file - BufferedInputStream bufferedIn = null; - try { - mimeTypeLookupByFileExt.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime")))); - } catch (final IOException e) { - System.err.println("ERROR: httpd.mime not found in settings path"); - } finally { - if (bufferedIn != null) try{bufferedIn.close();}catch(final Exception e){} - } - } - - /** - * A list of media extensions that should not be handled by the Parser - */ - private static final HashSet mediaExtSet = new HashSet(); - - /** - * A list of image, audio, video and application extensions - */ - private static final HashSet imageExtSet = new HashSet(); - private static final HashSet audioExtSet = new HashSet(); - private static final HashSet videoExtSet = new HashSet(); - private static final HashSet appsExtSet = new HashSet(); - - - /** - * Initializing the - * @see #initMediaExt(String) - */ - static { - final String apps = "sit,hqx,img,dmg,exe,com,bat,sh,vbs,zip,jar"; - final String audio = "mp2,mp3,ogg,aac,aif,aiff,wav"; - final String video = "swf,avi,wmv,rm,mov,mpg,mpeg,ram,m4v"; - final String image = "jpg,jpeg,jpe,gif,png,ico,bmp"; - initMediaExt(extString2extList( - apps + "," + // application container - "tar,gz,bz2,arj,zip,rar," + // archive formats - "ps,xls,ppt,asf," + // text formats without support - audio + "," + // audio formats - video + "," + // video formats - image // image formats - )); - initImageExt(extString2extList(image)); // image formats - initAudioExt(extString2extList(audio)); // audio formats - initVideoExt(extString2extList(video)); // video formats - initAppsExt(extString2extList(apps)); // application formats - - /* =================================================== - * loading a list of available parsers - * =================================================== */ - loadAvailableParserList(); - } - - private static final Log theLogger = new Log("PARSER"); - - - /** - * This function is used to initialize the HTMLParsableMimeTypes List. - * This list contains a list of mimeTypes that can be parsed in realtime by - * the yacy html-Parser - * @param htmlParsableMimeTypes a list of mimetypes that can be parsed by the - * yacy html parser - */ - public static void initHTMLParsableMimeTypes(final String htmlParsableMimeTypes) { - final LinkedList mimeTypes = new LinkedList(); - if ((htmlParsableMimeTypes == null) || (htmlParsableMimeTypes.length() == 0)) { - return; - } - final String[] realtimeParsableMimeTypeList = htmlParsableMimeTypes.split(","); - for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) { - mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase().trim()); - } - synchronized (supportedHTMLMimeTypes) { - supportedHTMLMimeTypes.addAll(mimeTypes); - } - } - - public static List extString2extList(final String extString) { - final LinkedList extensions = new LinkedList(); - if ((extString == null) || (extString.length() == 0)) { - return extensions; - } - final String[] xs = extString.split(","); - for (int i = 0; i < xs.length; i++) extensions.add(xs[i].toLowerCase().trim()); - return extensions; - } - - public static void initMediaExt(final List mediaExtList) { - synchronized (mediaExtSet) { - mediaExtSet.addAll(mediaExtList); - } - } - - private static void initImageExt(final List imageExtList) { - synchronized (imageExtSet) { - imageExtSet.addAll(imageExtList); - } - } - - private static void initAudioExt(final List audioExtList) { - synchronized (audioExtSet) { - audioExtSet.addAll(audioExtList); - } - } - - private static void initVideoExt(final List videoExtList) { - synchronized (videoExtSet) { - videoExtSet.addAll(videoExtList); - } - } - - private static void initAppsExt(final List appsExtList) { - synchronized (appsExtSet) { - appsExtSet.addAll(appsExtList); - } - } - - public static void initSupportedHTMLFileExt(final List supportedRealtimeFileExtList) { - synchronized (supportedHTMLFileExt) { - supportedHTMLFileExt.addAll(supportedRealtimeFileExtList); - } - } - - private static boolean HTMLParsableMimeTypesContains(String mimeType) { - mimeType = normalizeMimeType(mimeType); - synchronized (supportedHTMLMimeTypes) { - return supportedHTMLMimeTypes.contains(mimeType); - } - } - - public static String getFileExt(final yacyURL url) { - // getting the file path - String name = url.getPath(); - - // tetermining last position of / in the file path - int p = name.lastIndexOf('/'); - if (p != -1) { - name = name.substring(p); - } - - // termining last position of . in file path - p = name.lastIndexOf('.'); - if (p < 0) return ""; - return name.substring(p + 1); - } - - public static boolean mediaExtContains(String mediaExt) { - if (mediaExt == null) return false; - mediaExt = mediaExt.trim().toLowerCase(); - - synchronized (supportedHTMLFileExt) { - if (supportedHTMLFileExt.contains(mediaExt)) return false; - } - - if (supportedFileExtContains(mediaExt)) return false; - - synchronized (mediaExtSet) { - return mediaExtSet.contains(mediaExt); - } - } - - public static boolean imageExtContains(final String imageExt) { - if (imageExt == null) return false; - synchronized (imageExtSet) { - return imageExtSet.contains(imageExt.trim().toLowerCase()); - } - } - - public static boolean audioExtContains(final String audioExt) { - if (audioExt == null) return false; - synchronized (audioExtSet) { - return audioExtSet.contains(audioExt.trim().toLowerCase()); - } - } - - public static boolean videoExtContains(final String videoExt) { - if (videoExt == null) return false; - synchronized (videoExtSet) { - return videoExtSet.contains(videoExt.trim().toLowerCase()); - } - } - - public static boolean appsExtContains(final String appsExt) { - if (appsExt == null) return false; - synchronized (appsExtSet) { - return appsExtSet.contains(appsExt.trim().toLowerCase()); - } - } - - public static String normalizeMimeType(String mimeType) { - //if (mimeType == null) doMimeTypeAnalysis - if (mimeType == null) mimeType = "application/octet-stream"; - mimeType = mimeType.trim().toLowerCase(); - - final int pos = mimeType.indexOf(';'); - return ((pos < 0) ? mimeType : mimeType.substring(0, pos)); - } - - public static String getMimeTypeByFileExt(final String fileExt) { - return mimeTypeLookupByFileExt.getProperty(fileExt,"application/octet-stream"); - } - - public static HashMap getAvailableParserList() { - return availableParserList; - } - - private static void loadAvailableParserList() { - initParser(new bzipParser()); - initParser(new docParser()); - initParser(new gzipParser()); - initParser(new mimeTypeParser()); - initParser(new odtParser()); - initParser(new pdfParser()); - initParser(new pptParser()); - initParser(new psParser()); - initParser(new rpmParser()); - initParser(new rssParser()); - initParser(new rtfParser()); - initParser(new sevenzipParser()); - initParser(new swfParser()); - initParser(new tarParser()); - initParser(new vcfParser()); - initParser(new vsdParser()); - initParser(new xlsParser()); - initParser(new zipParser()); - } - - private static void initParser(Parser theParser) { - // loading the list of mime-types that are supported by this parser class - final Hashtable supportedMimeTypes = theParser.getSupportedMimeTypes(); - - final Iterator mimeTypeIterator = supportedMimeTypes.keySet().iterator(); - while (mimeTypeIterator.hasNext()) { - final String mimeType = mimeTypeIterator.next(); - availableParserList.put(mimeType, theParser); - Log.logInfo("PARSER", "Found parser for mimeType '" + mimeType + "'." + - "\n\tName: " + theParser.getName()); - } - } - - public static Document parseSource(final yacyURL location, final String mimeType, final String charset, final byte[] sourceArray) - throws InterruptedException, ParserException { - ByteArrayInputStream byteIn = null; - try { - if (theLogger.isFine()) - theLogger.logFine("Parsing '" + location + "' from byte-array"); - - // testing if the resource is not empty - if (sourceArray == null || sourceArray.length == 0) { - final String errorMsg = "No resource content available (1) " + (((sourceArray == null) ? "source == null" : "source.length() == 0") + ", url = " + location.toNormalform(true, false)); - theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg,location, errorMsg); - } - - // creating an InputStream - byteIn = new ByteArrayInputStream(sourceArray); - - // parsing the temp file - return parseSource(location, mimeType, charset, sourceArray.length, byteIn); - - } catch (final Exception e) { - // Interrupted- and Parser-Exceptions should pass through - if (e instanceof InterruptedException) throw (InterruptedException) e; - if (e instanceof ParserException) throw (ParserException) e; - - // log unexpected error - theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e); - throw new ParserException("Unexpected exception while parsing " + location,location, e); - } finally { - if (byteIn != null) try { byteIn.close(); } catch (final Exception ex){/* ignore this */} - } - - } - - public static Document parseSource(final yacyURL location, final String theMimeType, final String theDocumentCharset, final File sourceFile) throws InterruptedException, ParserException { - - BufferedInputStream sourceStream = null; - try { - if (theLogger.isFine()) - theLogger.logFine("Parsing '" + location + "' from file"); - - // testing if the resource is not empty - if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { - final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2)."; - theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg,location, "document has no content"); - } - - // create a new InputStream - sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); - - // parsing the data - return parseSource(location, theMimeType, theDocumentCharset, sourceFile.length(), sourceStream); - - } catch (final Exception e) { - // Interrupted- and Parser-Exceptions should pass through - if (e instanceof InterruptedException) throw (InterruptedException) e; - if (e instanceof ParserException) throw (ParserException) e; - - // log unexpected error - theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e); - throw new ParserException("Unexpected exception while parsing " + location,location, e); - } finally { - if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex){/* ignore this */} - } - } - - /** - * To parse a resource from an {@link InputStream} - * @param location the URL of the resource - * @param theMimeType the resource mimetype (null if unknown) - * @param theDocumentCharset the charset of the resource (null if unknown) - * @param contentLength the content length of the resource (-1 if unknown) - * @param sourceStream an {@link InputStream} containing the resource body - * @return the parsed {@link ParserDocument document} - * @throws InterruptedException - * @throws ParserException - */ - public static Document parseSource(final yacyURL location, final String theMimeType, final String theDocumentCharset, final long contentLength, final InputStream sourceStream) throws InterruptedException, ParserException { - Parser theParser = null; - String mimeType = null; - try { - if (theLogger.isFine()) - theLogger.logFine("Parsing '" + location + "' from stream"); - - // getting the mimetype of the document - mimeType = normalizeMimeType(theMimeType); - - // getting the file extension of the document - final String fileExt = getFileExt(location); - - // getting the charset of the document - // TODO: do a charset detection here .... - final String documentCharset = htmlParser.patchCharsetEncoding(theDocumentCharset); - - // testing if parsing is supported for this resource - if (!supportedContent(location,mimeType)) { - final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (1)"; - theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg,location, "wrong mime type or wrong extension"); - } - - if (theLogger.isFine()) - theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType + - "' and file extension '" + fileExt + "'."); - - // getting the correct parser for the given mimeType - theParser = getParser(mimeType); - - // if a parser was found we use it ... - Document doc = null; - if (theParser != null) { - // set the content length of the resource - theParser.setContentLength(contentLength); - // parse the resource - doc = theParser.parse(location, mimeType,documentCharset,sourceStream); - } else if (HTMLParsableMimeTypesContains(mimeType)) { - doc = new htmlParser().parse(location, mimeType, documentCharset, sourceStream); - } else { - final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)"; - theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg,location, "wrong mime type or wrong extension"); - } - - // check result - if (doc == null) { - final String errorMsg = "Unexpected error. Parser returned null."; - theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg,location); - } - return doc; - - } catch (final Exception e) { - // Interrupted- and Parser-Exceptions should pass through - if (e instanceof InterruptedException) throw (InterruptedException) e; - if (e instanceof ParserException) throw (ParserException) e; - - // log unexpected error - final String errorMsg = "Unexpected exception. " + e.getMessage(); - theLogger.logSevere("Unable to parse '" + location + "'. " + errorMsg, e); - throw new ParserException(errorMsg,location,e); - - } finally { - if (theParser != null) { - theParser = null; // delete object - } - } - } - - - - - /** - * This function is used to determine the parser class that should be used for a given - * mimetype ... - * @param mimeType MIME-Type of the resource - * @return the {@link Parser}-class that is supposed to parse the resource of - * the given MIME-Type - */ - private static Parser getParser(String mimeType) { - - mimeType = normalizeMimeType(mimeType); - - // determining the proper parser class name for the mimeType - return availableParserList.get(mimeType); - } - - public static Map allReflinks(final Collection links) { - // links is either a Set of Strings (with urls) or htmlFilterImageEntries - // we find all links that are part of a reference inside a url - final HashMap v = new HashMap(); - final Iterator i = links.iterator(); - Object o; - yacyURL url; - String u; - int pos; - loop: while (i.hasNext()) try { - o = i.next(); - if (o instanceof yacyURL) url = (yacyURL) o; - else if (o instanceof String) url = new yacyURL((String) o, null); - else if (o instanceof ImageEntry) url = ((ImageEntry) o).url(); - else { - assert false; - continue; - } - u = url.toNormalform(true, true); - if ((pos = u.toLowerCase().indexOf("http://",7)) > 0) { - i.remove(); - u = u.substring(pos); - while ((pos = u.toLowerCase().indexOf("http://",7)) > 0) u = u.substring(pos); - url = new yacyURL(u, null); - if (!(v.containsKey(url))) v.put(url, "ref"); - continue loop; - } - if ((pos = u.toLowerCase().indexOf("/www.",7)) > 0) { - i.remove(); - u = "http:/" + u.substring(pos); - while ((pos = u.toLowerCase().indexOf("/www.",7)) > 0) u = "http:/" + u.substring(pos); - url = new yacyURL(u, null); - if (!(v.containsKey(url))) v.put(url, "ref"); - continue loop; - } - } catch (final MalformedURLException e) {} - return v; - } - - static Map allSubpaths(final Collection links) { - // links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries - final HashSet h = new HashSet(); - Iterator i = links.iterator(); - Object o; - yacyURL url; - String u; - int pos; - int l; - while (i.hasNext()) try { - o = i.next(); - if (o instanceof yacyURL) url = (yacyURL) o; - else if (o instanceof String) url = new yacyURL((String) o, null); - else if (o instanceof ImageEntry) url = ((ImageEntry) o).url(); - else { - assert false; - continue; - } - u = url.toNormalform(true, true); - if (u.endsWith("/")) u = u.substring(0, u.length() - 1); - pos = u.lastIndexOf('/'); - while (pos > 8) { - l = u.length(); - u = u.substring(0, pos + 1); - h.add(u); - u = u.substring(0, pos); - assert (u.length() < l) : "u = " + u; - pos = u.lastIndexOf('/'); - } - } catch (final MalformedURLException e) {} - // now convert the strings to yacyURLs - i = h.iterator(); - final HashMap v = new HashMap(); - while (i.hasNext()) { - u = (String) i.next(); - try { - url = new yacyURL(u, null); - v.put(url, "sub"); - } catch (final MalformedURLException e) {} - } - return v; - } - - public static boolean supportedContent(final yacyURL url, final String mimeType) { - if (url == null) throw new NullPointerException(); - - if (parserConfig.supportedContent(url, mimeType)) return true; - - return false; - } - - public static void addParseableMimeTypes(final String configStr) { - parserConfig.addParseableMimeTypes(configStr); - } - - public static String[] setEnabledParserList(final Set mimeTypeSet) { - return parserConfig.setEnabledParserList(mimeTypeSet); - } - - public static boolean supportedFileExtContains(final String fileExt) { - return parserConfig.supportedFileExtContains(fileExt); - } - - public static boolean supportedMimeTypesContains(final String mimeType) { - return parserConfig.supportedMimeTypesContains(mimeType); - } - -} diff --git a/source/de/anomic/document/parser/bzipParser.java b/source/de/anomic/document/parser/bzipParser.java index 33de51192..8173e80be 100644 --- a/source/de/anomic/document/parser/bzipParser.java +++ b/source/de/anomic/document/parser/bzipParser.java @@ -35,14 +35,14 @@ import java.util.Hashtable; import org.apache.tools.bzip2.CBZip2InputStream; import de.anomic.document.AbstractParser; +import de.anomic.document.Idiom; import de.anomic.document.Parser; -import de.anomic.document.ParserDispatcher; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.kelondro.util.FileUtils; import de.anomic.yacy.yacyURL; -public class bzipParser extends AbstractParser implements Parser { +public class bzipParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class @@ -107,7 +107,7 @@ public class bzipParser extends AbstractParser implements Parser { checkInterruption(); // creating a new parser class to parse the unzipped content - return ParserDispatcher.parseSource(location,null,null,tempFile); + return Parser.parseSource(location,null,null,tempFile); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; diff --git a/source/de/anomic/document/parser/docParser.java b/source/de/anomic/document/parser/docParser.java index c76e96d65..ce4db99af 100644 --- a/source/de/anomic/document/parser/docParser.java +++ b/source/de/anomic/document/parser/docParser.java @@ -34,12 +34,12 @@ import org.textmining.extraction.TextExtractor; import org.textmining.extraction.word.WordTextExtractorFactory; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.yacy.yacyURL; -public class docParser extends AbstractParser implements Parser { +public class docParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class diff --git a/source/de/anomic/document/parser/gzipParser.java b/source/de/anomic/document/parser/gzipParser.java index 730ed7690..408bbff98 100644 --- a/source/de/anomic/document/parser/gzipParser.java +++ b/source/de/anomic/document/parser/gzipParser.java @@ -34,14 +34,14 @@ import java.util.Hashtable; import java.util.zip.GZIPInputStream; import de.anomic.document.AbstractParser; +import de.anomic.document.Idiom; import de.anomic.document.Parser; -import de.anomic.document.ParserDispatcher; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.kelondro.util.FileUtils; import de.anomic.yacy.yacyURL; -public class gzipParser extends AbstractParser implements Parser { +public class gzipParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class @@ -97,7 +97,7 @@ public class gzipParser extends AbstractParser implements Parser { checkInterruption(); // creating a new parser class to parse the unzipped content - return ParserDispatcher.parseSource(location,null,null,tempFile); + return Parser.parseSource(location,null,null,tempFile); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; diff --git a/source/de/anomic/document/parser/htmlParser.java b/source/de/anomic/document/parser/htmlParser.java index f441b8e21..2d83d09b6 100644 --- a/source/de/anomic/document/parser/htmlParser.java +++ b/source/de/anomic/document/parser/htmlParser.java @@ -1,3 +1,29 @@ +// htmlParser.java +// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 09.07.2009 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $ +// $LastChangedRevision: 5736 $ +// $LastChangedBy: borg-0300 $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + package de.anomic.document.parser; import java.io.IOException; @@ -9,7 +35,7 @@ import java.util.Hashtable; import de.anomic.document.AbstractParser; import de.anomic.document.Document; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.parser.html.ContentScraper; import de.anomic.document.parser.html.ScraperInputStream; @@ -17,7 +43,7 @@ import de.anomic.document.parser.html.TransformerWriter; import de.anomic.kelondro.util.FileUtils; import de.anomic.yacy.yacyURL; -public class htmlParser extends AbstractParser implements Parser { +public class htmlParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class @@ -187,6 +213,7 @@ public class htmlParser extends AbstractParser implements Parser { return encoding; } + public Hashtable getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; diff --git a/source/de/anomic/document/parser/mimeTypeParser.java b/source/de/anomic/document/parser/mimeTypeParser.java index 387d1cd7d..d36d72825 100644 --- a/source/de/anomic/document/parser/mimeTypeParser.java +++ b/source/de/anomic/document/parser/mimeTypeParser.java @@ -41,14 +41,14 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; import de.anomic.document.AbstractParser; +import de.anomic.document.Idiom; import de.anomic.document.Parser; -import de.anomic.document.ParserDispatcher; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.kelondro.util.FileUtils; import de.anomic.yacy.yacyURL; -public class mimeTypeParser extends AbstractParser implements Parser { +public class mimeTypeParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class @@ -140,7 +140,7 @@ public class mimeTypeParser extends AbstractParser implements Parser { checkInterruption(); // parsing the content using the determined mimetype - return ParserDispatcher.parseSource(location,mimeType,charset,sourceFile); + return Parser.parseSource(location,mimeType,charset,sourceFile); } throw new ParserException("Unable to detect mimetype of resource (3).",location); } catch (final MagicMatchNotFoundException e) { diff --git a/source/de/anomic/document/parser/odtParser.java b/source/de/anomic/document/parser/odtParser.java index 289e8d397..0f43be1a9 100644 --- a/source/de/anomic/document/parser/odtParser.java +++ b/source/de/anomic/document/parser/odtParser.java @@ -47,7 +47,7 @@ import com.catcode.odf.OpenDocumentTextInputStream; import de.anomic.crawler.HTTPLoader; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.http.httpClient; @@ -57,7 +57,7 @@ import de.anomic.kelondro.util.FileUtils; import de.anomic.server.serverCharBuffer; import de.anomic.yacy.yacyURL; -public class odtParser extends AbstractParser implements Parser { +public class odtParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class @@ -109,7 +109,7 @@ public class odtParser extends AbstractParser implements Parser { final long contentSize = zipEntry.getSize(); // creating a writer for output - if ((contentSize == -1) || (contentSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((contentSize == -1) || (contentSize > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { writerFile = File.createTempFile("odtParser",".prt"); writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); } else { diff --git a/source/de/anomic/document/parser/pdfParser.java b/source/de/anomic/document/parser/pdfParser.java index ce54e79da..af76bd98a 100644 --- a/source/de/anomic/document/parser/pdfParser.java +++ b/source/de/anomic/document/parser/pdfParser.java @@ -43,14 +43,14 @@ import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; import org.pdfbox.util.PDFTextStripper; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.kelondro.util.FileUtils; import de.anomic.server.serverCharBuffer; import de.anomic.yacy.yacyURL; -public class pdfParser extends AbstractParser implements Parser { +public class pdfParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class @@ -122,7 +122,7 @@ public class pdfParser extends AbstractParser implements Parser { } // creating a writer for output - if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { writerFile = File.createTempFile("pdfParser",".prt"); writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); } else { diff --git a/source/de/anomic/document/parser/pptParser.java b/source/de/anomic/document/parser/pptParser.java index 59ca84ae1..16676329f 100644 --- a/source/de/anomic/document/parser/pptParser.java +++ b/source/de/anomic/document/parser/pptParser.java @@ -34,12 +34,12 @@ import java.util.Hashtable; import org.apache.poi.hslf.extractor.PowerPointExtractor; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.yacy.yacyURL; -public class pptParser extends AbstractParser implements Parser { +public class pptParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class diff --git a/source/de/anomic/document/parser/psParser.java b/source/de/anomic/document/parser/psParser.java index cd84998f3..b7a60a405 100644 --- a/source/de/anomic/document/parser/psParser.java +++ b/source/de/anomic/document/parser/psParser.java @@ -37,13 +37,13 @@ import java.io.InputStreamReader; import java.util.Hashtable; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.kelondro.util.FileUtils; import de.anomic.yacy.yacyURL; -public class psParser extends AbstractParser implements Parser { +public class psParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class diff --git a/source/de/anomic/document/parser/rpmParser.java b/source/de/anomic/document/parser/rpmParser.java index 6039cd23e..79dba7936 100644 --- a/source/de/anomic/document/parser/rpmParser.java +++ b/source/de/anomic/document/parser/rpmParser.java @@ -38,7 +38,7 @@ import com.jguild.jrpm.io.datatype.DataTypeIf; import de.anomic.crawler.HTTPLoader; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.http.httpClient; @@ -51,7 +51,7 @@ import de.anomic.yacy.yacyURL; * @author theli * */ -public class rpmParser extends AbstractParser implements Parser { +public class rpmParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class diff --git a/source/de/anomic/document/parser/rssParser.java b/source/de/anomic/document/parser/rssParser.java index 9cacdcc40..aadf35034 100644 --- a/source/de/anomic/document/parser/rssParser.java +++ b/source/de/anomic/document/parser/rssParser.java @@ -39,7 +39,7 @@ import java.util.Map; import de.anomic.content.RSSMessage; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.document.parser.html.AbstractScraper; @@ -53,7 +53,7 @@ import de.anomic.kelondro.util.FileUtils; import de.anomic.server.serverCharBuffer; import de.anomic.yacy.yacyURL; -public class rssParser extends AbstractParser implements Parser { +public class rssParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class diff --git a/source/de/anomic/document/parser/rtfParser.java b/source/de/anomic/document/parser/rtfParser.java index b8d0e83a3..3a48ca4c3 100644 --- a/source/de/anomic/document/parser/rtfParser.java +++ b/source/de/anomic/document/parser/rtfParser.java @@ -34,12 +34,12 @@ import javax.swing.text.DefaultStyledDocument; import javax.swing.text.rtf.RTFEditorKit; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.yacy.yacyURL; -public class rtfParser extends AbstractParser implements Parser { +public class rtfParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class diff --git a/source/de/anomic/document/parser/sevenzipParser.java b/source/de/anomic/document/parser/sevenzipParser.java index 674a7b63b..db2159ece 100644 --- a/source/de/anomic/document/parser/sevenzipParser.java +++ b/source/de/anomic/document/parser/sevenzipParser.java @@ -41,8 +41,9 @@ import SevenZip.Archive.IInArchive; import SevenZip.Archive.SevenZipEntry; import SevenZip.Archive.SevenZip.Handler; import de.anomic.document.AbstractParser; +import de.anomic.document.Classification; +import de.anomic.document.Idiom; import de.anomic.document.Parser; -import de.anomic.document.ParserDispatcher; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.kelondro.util.FileUtils; @@ -50,7 +51,7 @@ import de.anomic.server.serverCachedFileOutputStream; import de.anomic.yacy.yacyURL; import de.anomic.yacy.logging.Log; -public class sevenzipParser extends AbstractParser implements Parser { +public class sevenzipParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class @@ -99,14 +100,14 @@ public class sevenzipParser extends AbstractParser implements Parser { @Override public Document parse(final yacyURL location, final String mimeType, final String charset, final byte[] source) throws ParserException, InterruptedException { - return parse(location, mimeType, charset, new ByteArrayIInStream(source), Parser.MAX_KEEP_IN_MEMORY_SIZE - source.length); + return parse(location, mimeType, charset, new ByteArrayIInStream(source), Idiom.MAX_KEEP_IN_MEMORY_SIZE - source.length); } @Override public Document parse(final yacyURL location, final String mimeType, final String charset, final File sourceFile) throws ParserException, InterruptedException { try { - return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Parser.MAX_KEEP_IN_MEMORY_SIZE); + return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Idiom.MAX_KEEP_IN_MEMORY_SIZE); } catch (final IOException e) { throw new ParserException("error processing 7zip archive", location, e); } @@ -115,7 +116,7 @@ public class sevenzipParser extends AbstractParser implements Parser { public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { try { - final serverCachedFileOutputStream cfos = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE); + final serverCachedFileOutputStream cfos = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); FileUtils.copy(source, cfos); if (cfos.isFallback()) { return parse(location, mimeType, charset, cfos.getContentFile()); @@ -189,11 +190,11 @@ public class sevenzipParser extends AbstractParser implements Parser { // workaround for relative links in file, normally '#' shall be used behind the location, see // below for reversion of the effects final yacyURL url = yacyURL.newURL(doc.dc_source(), this.prefix + "/" + super.filePath); - final String mime = ParserDispatcher.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); + final String mime = Classification.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); if (this.cfos.isFallback()) { - theDoc = ParserDispatcher.parseSource(url, mime, null, this.cfos.getContentFile()); + theDoc = Parser.parseSource(url, mime, null, this.cfos.getContentFile()); } else { - theDoc = ParserDispatcher.parseSource(url, mime, null, this.cfos.getContentBAOS()); + theDoc = Parser.parseSource(url, mime, null, this.cfos.getContentBAOS()); } this.doc.addSubDocument(theDoc); diff --git a/source/de/anomic/document/parser/swfParser.java b/source/de/anomic/document/parser/swfParser.java index 688c53387..d80bbdd0a 100644 --- a/source/de/anomic/document/parser/swfParser.java +++ b/source/de/anomic/document/parser/swfParser.java @@ -33,12 +33,12 @@ import java.util.Hashtable; import pt.tumba.parser.swf.SWF2HTML; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.yacy.yacyURL; -public class swfParser extends AbstractParser implements Parser { +public class swfParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class diff --git a/source/de/anomic/document/parser/tarParser.java b/source/de/anomic/document/parser/tarParser.java index bc06a4066..739d9e662 100644 --- a/source/de/anomic/document/parser/tarParser.java +++ b/source/de/anomic/document/parser/tarParser.java @@ -43,8 +43,9 @@ import com.ice.tar.TarEntry; import com.ice.tar.TarInputStream; import de.anomic.document.AbstractParser; +import de.anomic.document.Classification; +import de.anomic.document.Idiom; import de.anomic.document.Parser; -import de.anomic.document.ParserDispatcher; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.document.parser.html.ContentScraper; @@ -53,7 +54,7 @@ import de.anomic.kelondro.util.ByteBuffer; import de.anomic.kelondro.util.FileUtils; import de.anomic.yacy.yacyURL; -public class tarParser extends AbstractParser implements Parser { +public class tarParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class @@ -85,7 +86,7 @@ public class tarParser extends AbstractParser implements Parser { File outputFile = null; Document subDoc = null; try { - if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { outputFile = File.createTempFile("zipParser",".prt"); docText = new BufferedOutputStream(new FileOutputStream(outputFile)); } else { @@ -96,7 +97,7 @@ public class tarParser extends AbstractParser implements Parser { * If the mimeType was not reported correcly by the webserve we * have to decompress it first */ - final String ext = ParserDispatcher.getFileExt(location).toLowerCase(); + final String ext = Classification.getFileExt(location).toLowerCase(); if (ext.equals("gz") || ext.equals("tgz")) { source = new GZIPInputStream(source); } @@ -129,7 +130,7 @@ public class tarParser extends AbstractParser implements Parser { final String entryExt = (idx > -1) ? entryName.substring(idx+1) : ""; // trying to determine the mimeType per file extension - final String entryMime = ParserDispatcher.getMimeTypeByFileExt(entryExt); + final String entryMime = Classification.getMimeTypeByFileExt(entryExt); // getting the entry content File subDocTempFile = null; @@ -144,7 +145,7 @@ public class tarParser extends AbstractParser implements Parser { checkInterruption(); // parsing the content - subDoc = ParserDispatcher.parseSource(yacyURL.newURL(location,"#" + entryName),entryMime,null,subDocTempFile); + subDoc = Parser.parseSource(yacyURL.newURL(location,"#" + entryName),entryMime,null,subDocTempFile); } catch (final ParserException e) { this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getMessage()); } finally { diff --git a/source/de/anomic/document/parser/vcfParser.java b/source/de/anomic/document/parser/vcfParser.java index c7f89c978..b172ad523 100644 --- a/source/de/anomic/document/parser/vcfParser.java +++ b/source/de/anomic/document/parser/vcfParser.java @@ -39,7 +39,7 @@ import java.util.LinkedList; import de.anomic.crawler.HTTPLoader; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.http.httpClient; @@ -53,7 +53,7 @@ import de.anomic.yacy.yacyURL; * @author theli * */ -public class vcfParser extends AbstractParser implements Parser { +public class vcfParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class diff --git a/source/de/anomic/document/parser/vsdParser.java b/source/de/anomic/document/parser/vsdParser.java index 01ac2c32e..a3eb94fd1 100644 --- a/source/de/anomic/document/parser/vsdParser.java +++ b/source/de/anomic/document/parser/vsdParser.java @@ -31,14 +31,14 @@ import java.io.InputStream; import java.util.Hashtable; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.yacy.yacyURL; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hpsf.SummaryInformation; -public class vsdParser extends AbstractParser implements Parser { +public class vsdParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class diff --git a/source/de/anomic/document/parser/xlsParser.java b/source/de/anomic/document/parser/xlsParser.java index afb73c463..97634b0f0 100644 --- a/source/de/anomic/document/parser/xlsParser.java +++ b/source/de/anomic/document/parser/xlsParser.java @@ -40,12 +40,12 @@ import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import de.anomic.document.AbstractParser; -import de.anomic.document.Parser; +import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.yacy.yacyURL; -public class xlsParser extends AbstractParser implements Parser, HSSFListener { +public class xlsParser extends AbstractParser implements Idiom, HSSFListener { //StringBuilder for parsed text private StringBuilder sbFoundStrings = null; diff --git a/source/de/anomic/document/parser/zipParser.java b/source/de/anomic/document/parser/zipParser.java index 59cbe7b8a..7b08f58ec 100644 --- a/source/de/anomic/document/parser/zipParser.java +++ b/source/de/anomic/document/parser/zipParser.java @@ -41,8 +41,9 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import de.anomic.document.AbstractParser; +import de.anomic.document.Classification; +import de.anomic.document.Idiom; import de.anomic.document.Parser; -import de.anomic.document.ParserDispatcher; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.document.parser.html.ContentScraper; @@ -51,7 +52,7 @@ import de.anomic.kelondro.util.ByteBuffer; import de.anomic.kelondro.util.FileUtils; import de.anomic.yacy.yacyURL; -public class zipParser extends AbstractParser implements Parser { +public class zipParser extends AbstractParser implements Idiom { /** * a list of mime types that are supported by this parser class @@ -85,7 +86,7 @@ public class zipParser extends AbstractParser implements Parser { File outputFile = null; Document subDoc = null; try { - if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { outputFile = File.createTempFile("zipParser",".prt"); docText = new BufferedOutputStream(new FileOutputStream(outputFile)); } else { @@ -117,7 +118,7 @@ public class zipParser extends AbstractParser implements Parser { final String entryExt = (idx > -1) ? entryName.substring(idx+1) : ""; // trying to determine the mimeType per file extension - final String entryMime = ParserDispatcher.getMimeTypeByFileExt(entryExt); + final String entryMime = Classification.getMimeTypeByFileExt(entryExt); // parsing the content File subDocTempFile = null; @@ -129,7 +130,7 @@ public class zipParser extends AbstractParser implements Parser { FileUtils.copy(zippedContent,subDocTempFile,entry.getSize()); // parsing the zip file entry - subDoc = ParserDispatcher.parseSource(yacyURL.newURL(location,"#" + entryName),entryMime,null, subDocTempFile); + subDoc = Parser.parseSource(yacyURL.newURL(location,"#" + entryName),entryMime,null, subDocTempFile); } catch (final ParserException e) { this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage()); } finally { diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java index cf0d969d6..ad5df2dc4 100644 --- a/source/de/anomic/http/httpdFileHandler.java +++ b/source/de/anomic/http/httpdFileHandler.java @@ -80,7 +80,7 @@ import java.util.Properties; import java.util.concurrent.ConcurrentHashMap; import java.util.zip.GZIPOutputStream; -import de.anomic.document.ParserDispatcher; +import de.anomic.document.Classification; import de.anomic.document.parser.htmlParser; import de.anomic.document.parser.html.ContentScraper; import de.anomic.document.parser.html.ScraperInputStream; @@ -231,7 +231,7 @@ public final class httpdFileHandler { } headers.put(httpHeader.SERVER, "AnomicHTTPD (www.anomic.de)"); headers.put(httpHeader.DATE, DateFormatter.formatRFC1123(new Date())); - if(!(ParserDispatcher.mediaExtContains(ext))){ + if(!(Classification.mediaExtContains(ext))){ headers.put(httpHeader.PRAGMA, "no-cache"); } return headers; diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 59d7b3496..cb5e791c2 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -73,7 +73,7 @@ import java.util.zip.GZIPOutputStream; import de.anomic.crawler.HTTPLoader; import de.anomic.data.Blacklist; -import de.anomic.document.ParserDispatcher; +import de.anomic.document.Classification; import de.anomic.document.parser.html.ContentTransformer; import de.anomic.document.parser.html.Transformer; import de.anomic.kelondro.util.DateFormatter; @@ -528,7 +528,7 @@ public final class httpdProxyHandler { final String storeError = cacheEntry.shallStoreCacheForProxy(); final boolean storeHTCache = cacheEntry.profile().storeHTCache(); - final boolean isSupportedContent = ParserDispatcher.supportedContent(cacheEntry.url(), cacheEntry.getMimeType()); + final boolean isSupportedContent = Classification.supportedContent(cacheEntry.url(), cacheEntry.getMimeType()); if ( /* * Now we store the response into the htcache directory if diff --git a/source/de/anomic/kelondro/order/NaturalOrder.java b/source/de/anomic/kelondro/order/NaturalOrder.java index 99520e030..c15c2b51d 100644 --- a/source/de/anomic/kelondro/order/NaturalOrder.java +++ b/source/de/anomic/kelondro/order/NaturalOrder.java @@ -26,7 +26,6 @@ package de.anomic.kelondro.order; -import java.io.IOException; import java.util.Comparator; import java.util.Iterator; @@ -238,7 +237,7 @@ public final class NaturalOrder extends AbstractOrder implements ByteOrd return sb.toString(); } - public static Iterator LongIterator(Iterator b256Iterator) throws IOException { + public static Iterator LongIterator(Iterator b256Iterator) { return new LongIter(b256Iterator); } diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 20e1d1a71..2dc04f298 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -41,7 +41,7 @@ import java.io.InputStream; import java.util.HashMap; import java.util.Map; -import de.anomic.document.ParserDispatcher; +import de.anomic.document.Classification; import de.anomic.http.httpResponseHeader; import de.anomic.http.httpDocument; import de.anomic.kelondro.blob.ArrayStack; @@ -181,7 +181,7 @@ public final class plasmaHTCache { } public static boolean isText(final String mimeType) { - return ParserDispatcher.supportedMimeTypesContains(mimeType); + return Classification.supportedMimeTypesContains(mimeType); } public static boolean noIndexingURL(final yacyURL url) { @@ -200,7 +200,7 @@ public final class plasmaHTCache { //php - return ParserDispatcher.mediaExtContains(urlString); + return Classification.mediaExtContains(urlString); } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 1a01ff4e5..990e38457 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -144,7 +144,8 @@ import de.anomic.data.wiki.wikiBoard; import de.anomic.data.wiki.wikiCode; import de.anomic.data.wiki.wikiParser; import de.anomic.document.Condenser; -import de.anomic.document.ParserDispatcher; +import de.anomic.document.Classification; +import de.anomic.document.Parser; import de.anomic.document.ParserException; import de.anomic.document.Word; import de.anomic.document.Document; @@ -517,13 +518,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch= 0) && (ParserDispatcher.supportedFileExtContains(filename.substring(p + 1)))) + ((p >= 0) && (Classification.supportedFileExtContains(filename.substring(p + 1)))) ) { String supposedMime = "text/html"; // if the mimeType Parser is installed we can set the mimeType to null to force // a mimetype detection - if (ParserDispatcher.supportedMimeTypesContains("application/octet-stream")) { + if (Classification.supportedMimeTypesContains("application/octet-stream")) { supposedMime = null; } else if (p != -1){ // otherwise we try to determine the mimeType per file Extension - supposedMime = ParserDispatcher.getMimeTypeByFileExt(filename.substring(p + 1)); + supposedMime = Classification.getMimeTypeByFileExt(filename.substring(p + 1)); } - return ParserDispatcher.parseSource(url, supposedMime, null, contentLength, resourceStream); + return Parser.parseSource(url, supposedMime, null, contentLength, resourceStream); } return null; } - if (ParserDispatcher.supportedMimeTypesContains(responseHeader.mime())) { - return ParserDispatcher.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream); + if (Classification.supportedMimeTypesContains(responseHeader.mime())) { + return Parser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), contentLength, resourceStream); } return null; } catch (final InterruptedException e) { diff --git a/source/de/anomic/tools/mediawikiIndex.java b/source/de/anomic/tools/mediawikiIndex.java index 45e004ea1..8bad45c78 100644 --- a/source/de/anomic/tools/mediawikiIndex.java +++ b/source/de/anomic/tools/mediawikiIndex.java @@ -58,7 +58,8 @@ import java.util.concurrent.TimeoutException; import de.anomic.data.wiki.wikiCode; import de.anomic.data.wiki.wikiParser; -import de.anomic.document.ParserDispatcher; +import de.anomic.document.Classification; +import de.anomic.document.Parser; import de.anomic.document.ParserException; import de.anomic.document.Document; import de.anomic.kelondro.util.ByteBuffer; @@ -102,8 +103,8 @@ public class mediawikiIndex extends Thread { this.count = 0; this.start = 0; // must be called before usage: - ParserDispatcher.initHTMLParsableMimeTypes("text/html"); - ParserDispatcher.addParseableMimeTypes("text/html"); + Classification.initHTMLParsableMimeTypes("text/html"); + Classification.addParseableMimeTypes("text/html"); } /** @@ -145,8 +146,8 @@ public class mediawikiIndex extends Thread { StringBuilder sb = new StringBuilder(); boolean page = false, text = false; String title = null; - ParserDispatcher.initHTMLParsableMimeTypes("text/html"); - ParserDispatcher.addParseableMimeTypes("text/html"); + Classification.initHTMLParsableMimeTypes("text/html"); + Classification.addParseableMimeTypes("text/html"); wikiparserrecord poison = newRecord(); int threads = Math.max(2, Runtime.getRuntime().availableProcessors() - 1); BlockingQueue in = new ArrayBlockingQueue(threads * 10); @@ -487,7 +488,7 @@ public class mediawikiIndex extends Thread { public void genDocument() throws InterruptedException, ParserException { try { url = new yacyURL(urlStub + title, null); - document = ParserDispatcher.parseSource(url, "text/html", "utf-8", html.getBytes("UTF-8")); + document = Parser.parseSource(url, "text/html", "utf-8", html.getBytes("UTF-8")); // the wiki parser is not able to find the proper title in the source text, so it must be set here document.setTitle(title); } catch (UnsupportedEncodingException e) { diff --git a/source/de/anomic/yacy/dht/FlatWordPartitionScheme.java b/source/de/anomic/yacy/dht/FlatWordPartitionScheme.java index 83aa0da09..4be71a0d9 100755 --- a/source/de/anomic/yacy/dht/FlatWordPartitionScheme.java +++ b/source/de/anomic/yacy/dht/FlatWordPartitionScheme.java @@ -28,6 +28,12 @@ package de.anomic.yacy.dht; import de.anomic.kelondro.order.Base64Order; import de.anomic.yacy.yacySeed; +/** + * A flat word partition scheme is a metric for words on the range of a distributed + * hash table. The dht is reflected by a 0..Long.MAX_VALUE integer range, each word gets + * a number on that range. To compute a number, the hash representation is used to compute + * the hash position from the first 63 bits of the b64 hash string. + */ public class FlatWordPartitionScheme implements PartitionScheme { public static final FlatWordPartitionScheme std = new FlatWordPartitionScheme(); diff --git a/source/de/anomic/ymage/ymageChart.java b/source/de/anomic/ymage/ymageChart.java index 1d8de3253..96607c669 100644 --- a/source/de/anomic/ymage/ymageChart.java +++ b/source/de/anomic/ymage/ymageChart.java @@ -147,12 +147,11 @@ public class ymageChart extends ymageMatrix { public static void main(final String[] args) { System.setProperty("java.awt.headless", "true"); - final boolean invers = false; - final String bg = (invers) ? "000000" : "FFFFFF"; - final String fg = (invers) ? "FFFFFF" : "000000"; - final String scale = (invers) ? "333333" : "CCCCCC"; - final String green = (invers) ? "008800" : "008800"; - final String blue = (invers) ? "0000FF" : "0000FF"; + final String bg = "FFFFFF"; + final String fg = "000000"; + final String scale = "CCCCCC"; + final String green = "008800"; + final String blue = "0000FF"; final ymageChart ip = new ymageChart(660, 240, bg, fg, fg, 30, 30, 20, 20, "PEER PERFORMANCE GRAPH: PAGES/MINUTE and USED MEMORY", ""); ip.declareDimension(DIMENSION_BOTTOM, 60, 60, -600, fg, scale, "TIME/SECONDS"); //ip.declareDimension(DIMENSION_TOP, 10, 40, "000000", null, "count");