From 610b01e1c373becd63c691dd351162ee6c79f406 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 1 Sep 2011 16:05:00 +0000 Subject: [PATCH] - added an 'add every media object linked in an html document as a new document' feature to the html parser. As a result, every image, app, video or audio file that is linked in an html file is added as a document. In effect, parsing a single html document may cause a number of documents to be inserted into the search index. - some refactoring for mime type discovery git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7919 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../anomic/crawler/retrieval/FileLoader.java | 4 +- .../anomic/crawler/retrieval/SMBLoader.java | 4 +- source/de/anomic/data/MimeTable.java | 53 ----- .../anomic/http/server/HTTPDFileHandler.java | 13 +- source/de/anomic/search/MediaSnippet.java | 6 +- source/net/yacy/document/Classification.java | 53 +++-- .../net/yacy/document/parser/htmlParser.java | 205 ++++++++++++------ 7 files changed, 193 insertions(+), 145 deletions(-) delete mode 100644 source/de/anomic/data/MimeTable.java diff --git a/source/de/anomic/crawler/retrieval/FileLoader.java b/source/de/anomic/crawler/retrieval/FileLoader.java index 1c3cc2be7..332f8abf3 100644 --- a/source/de/anomic/crawler/retrieval/FileLoader.java +++ b/source/de/anomic/crawler/retrieval/FileLoader.java @@ -33,12 +33,12 @@ import java.util.List; import de.anomic.search.Segments; import de.anomic.search.Switchboard; import de.anomic.crawler.CrawlProfile; -import de.anomic.data.MimeTable; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ftp.FTPClient; +import net.yacy.document.Classification; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; @@ -102,7 +102,7 @@ public class FileLoader { } // create response header - String mime = 
MimeTable.ext2mime(url.getFileExtension()); + String mime = Classification.ext2mime(url.getFileExtension()); ResponseHeader responseHeader = new ResponseHeader(); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java index feb2a6e45..c08dff0b3 100644 --- a/source/de/anomic/crawler/retrieval/SMBLoader.java +++ b/source/de/anomic/crawler/retrieval/SMBLoader.java @@ -42,13 +42,13 @@ import jcifs.smb.SmbFileInputStream; import de.anomic.search.Segments; import de.anomic.search.Switchboard; import de.anomic.crawler.CrawlProfile; -import de.anomic.data.MimeTable; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ftp.FTPClient; +import net.yacy.document.Classification; import net.yacy.document.TextParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; @@ -122,7 +122,7 @@ public class SMBLoader { } // create response header - String mime = MimeTable.ext2mime(url.getFileExtension()); + String mime = Classification.ext2mime(url.getFileExtension()); ResponseHeader responseHeader = new ResponseHeader(); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); diff --git a/source/de/anomic/data/MimeTable.java b/source/de/anomic/data/MimeTable.java deleted file mode 100644 index b248f23bc..000000000 --- a/source/de/anomic/data/MimeTable.java +++ /dev/null @@ -1,53 +0,0 @@ -package de.anomic.data; - -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.util.Properties; - -import 
net.yacy.cora.document.MultiProtocolURI; -import net.yacy.kelondro.logging.Log; - -public class MimeTable { - - private static final Properties mimeTable = new Properties(); - - public static void init(final File mimeFile) { - if (mimeTable.isEmpty()) { - // load the mime table - BufferedInputStream mimeTableInputStream = null; - try { - mimeTableInputStream = new BufferedInputStream(new FileInputStream(mimeFile)); - mimeTable.load(mimeTableInputStream); - } catch (final Exception e) { - Log.logException(e); - } finally { - if (mimeTableInputStream != null) try { mimeTableInputStream.close(); } catch (final Exception e1) {} - } - } - } - - public static int size() { - return mimeTable.size(); - } - - public static boolean isEmpty() { - return mimeTable.isEmpty(); - } - - public static String ext2mime(final String ext) { - return mimeTable.getProperty(ext, "application/" + ext); - } - - public static String ext2mime(final String ext, final String dfltMime) { - return mimeTable.getProperty(ext, dfltMime); - } - - public static String url2mime(final MultiProtocolURI url, final String dfltMime) { - return ext2mime(url.getFileExtension(), dfltMime); - } - - public static String url2mime(final MultiProtocolURI url) { - return ext2mime(url.getFileExtension()); - } -} diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java index 0ccf42dd7..1a06f05c7 100644 --- a/source/de/anomic/http/server/HTTPDFileHandler.java +++ b/source/de/anomic/http/server/HTTPDFileHandler.java @@ -105,7 +105,6 @@ import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; import net.yacy.visualization.RasterPlotter; -import de.anomic.data.MimeTable; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverClassLoader; @@ -153,11 +152,11 @@ public final class HTTPDFileHandler { if (switchboard == null) { switchboard = 
theSwitchboard; - if (MimeTable.isEmpty()) { + if (Classification.countMimes() == 0) { // load the mime table final String mimeTablePath = theSwitchboard.getConfig("mimeTable",""); Log.logConfig("HTTPDFiles", "Loading mime mapping file " + mimeTablePath); - MimeTable.init(new File(theSwitchboard.getAppPath(), mimeTablePath)); + Classification.init(new File(theSwitchboard.getAppPath(), mimeTablePath)); } // create default files array @@ -586,7 +585,7 @@ public final class HTTPDFileHandler { // send an image to client targetDate = new Date(System.currentTimeMillis()); nocache = true; - final String mimeType = MimeTable.ext2mime(targetExt, "text/html"); + final String mimeType = Classification.ext2mime(targetExt, "text/html"); final ByteBuffer result = RasterPlotter.exportImage(yp.getImage(), targetExt); // write the array to the client @@ -600,7 +599,7 @@ public final class HTTPDFileHandler { // send an image to client targetDate = new Date(System.currentTimeMillis()); nocache = true; - final String mimeType = MimeTable.ext2mime(targetExt, "text/html"); + final String mimeType = Classification.ext2mime(targetExt, "text/html"); final ByteBuffer result = yp.getImage(); // write the array to the client @@ -634,7 +633,7 @@ public final class HTTPDFileHandler { // send an image to client targetDate = new Date(System.currentTimeMillis()); nocache = true; - final String mimeType = MimeTable.ext2mime(targetExt, "text/html"); + final String mimeType = Classification.ext2mime(targetExt, "text/html"); // generate an byte array from the generated image int width = i.getWidth(null); if (width < 0) width = 96; // bad hack @@ -805,7 +804,7 @@ public final class HTTPDFileHandler { // we have found a file that can be written to the client // if this file uses templates, then we use the template // re-write - method to create an result - String mimeType = MimeTable.ext2mime(targetExt, "text/html"); + String mimeType = Classification.ext2mime(targetExt, "text/html"); String ext = 
(String) conProp.get("EXT"); if (ext == null) ext = ""; final boolean zipContent = requestHeader.acceptGzip() && HTTPDemon.shallTransportZipped("." + ext); if (path.endsWith("html") || diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java index 0b71c62f4..ca0b91dff 100644 --- a/source/de/anomic/search/MediaSnippet.java +++ b/source/de/anomic/search/MediaSnippet.java @@ -37,6 +37,7 @@ import java.util.TreeSet; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.services.federated.yacy.CacheStrategy; +import net.yacy.document.Classification; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.WordTokenizer; @@ -47,7 +48,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.ByteArray; -import de.anomic.data.MimeTable; public class MediaSnippet implements Comparable, Comparator { @@ -167,7 +167,7 @@ public class MediaSnippet implements Comparable, Comparator, Comparator videoExtSet = new HashSet(); private static final Set appsExtSet = new HashSet(); - private static final Properties ext2mime = new Properties(); static { - // load a list of extensions from file - BufferedInputStream bufferedIn = null; - File mimeFile = new File("defaults/httpd.mime"); - if (!mimeFile.exists()) mimeFile = new File("config/mime.properties"); - try { - ext2mime.load(bufferedIn = new BufferedInputStream(new FileInputStream(mimeFile))); - } catch (final IOException e) { - Log.logSevere("Classification", "httpd.mime not found in " + mimeFile.toString(), e); - } finally { - if (bufferedIn != null) try { - bufferedIn.close(); - } catch (final Exception e) {} - } final String apps = 
"7z,ace,arc,arj,apk,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip"; final String audio = "aac,aif,aiff,flac,m4a,m4p,mid,mp2,mp3,oga,ogg,ram,sid,wav,wma"; @@ -102,4 +88,41 @@ public class Classification { return mimeType.toUpperCase().startsWith("IMAGE"); } + + private static final Properties mimeTable = new Properties(); + + public static void init(final File mimeFile) { + if (mimeTable.isEmpty()) { + // load the mime table + BufferedInputStream mimeTableInputStream = null; + try { + mimeTableInputStream = new BufferedInputStream(new FileInputStream(mimeFile)); + mimeTable.load(mimeTableInputStream); + } catch (final Exception e) { + Log.logException(e); + } finally { + if (mimeTableInputStream != null) try { mimeTableInputStream.close(); } catch (final Exception e1) {} + } + } + } + + public static int countMimes() { + return mimeTable.size(); + } + + public static String ext2mime(final String ext) { + return mimeTable.getProperty(ext, "application/" + ext); + } + + public static String ext2mime(final String ext, final String dfltMime) { + return mimeTable.getProperty(ext, dfltMime); + } + + public static String url2mime(final MultiProtocolURI url, final String dfltMime) { + return ext2mime(url.getFileExtension(), dfltMime); + } + + public static String url2mime(final MultiProtocolURI url) { + return ext2mime(url.getFileExtension()); + } } diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index c610c2d63..f248ad99d 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -32,15 +32,20 @@ import java.net.MalformedURLException; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; +import java.util.ArrayList; +import java.util.Map; +import java.util.Map.Entry; import 
java.util.regex.Pattern; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.document.AbstractParser; +import net.yacy.document.Classification; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.ContentScraper; +import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ScraperInputStream; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.util.FileUtils; @@ -83,6 +88,133 @@ public class htmlParser extends AbstractParser implements Parser { this.SUPPORTED_MIME_TYPES.add("text/csv"); } + public Document[] parse( + final MultiProtocolURI location, + final String mimeType, + final String documentCharset, + final InputStream sourceStream) throws Parser.Failure, InterruptedException { + + try { + // first get a document from the parsed html + ContentScraper scraper = parseToScraper(location, documentCharset, sourceStream); + Document document = transformScraper(location, mimeType, documentCharset, scraper); + + // then produce virtual documents for each of the link that is contained in the document! 
+ ArrayList docs = new ArrayList(); + docs.add(document); + for (Map.Entry link: document.getApplinks().entrySet()) { + addLinkDocs(docs, "application", link.getKey(), link.getValue(), scraper); + } + for (Map.Entry link: document.getAudiolinks().entrySet()) { + addLinkDocs(docs, "audio", link.getKey(), link.getValue(), scraper); + } + for (Map.Entry link: document.getVideolinks().entrySet()) { + addLinkDocs(docs, "video", link.getKey(), link.getValue(), scraper); + } + for (Entry link: document.getImages().entrySet()) { + addImageDocs(docs, link.getValue()); + } + + + // finally return the list of documents + return docs.toArray(new Document[docs.size()]); + } catch (final IOException e) { + throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location); + } + } + + private final static void addLinkDocs(ArrayList docs, String type, MultiProtocolURI uri, String descr, ContentScraper scraper) { + //System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr); + final Document doc = new Document( + uri, + Classification.ext2mime(uri.getFileExtension()), + "UTF-8", + null, + scraper.getContentLanguages(), + null, + descr, + "", + "", + new String[]{descr}, + type, + 0.0f, 0.0f, + uri.toNormalform(false, false), + null, + null, + null, + false); + docs.add(doc); + } + + private final static void addImageDocs(ArrayList docs, ImageEntry img) { + //System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt()); + final Document doc = new Document( + img.url(), + Classification.ext2mime(img.url().getFileExtension()), + "UTF-8", + null, + null, + null, + img.alt(), + "", + "", + new String[]{img.alt()}, + "image", + 0.0f, 0.0f, + img.url().toNormalform(false, false), + null, + null, + null, + false); + docs.add(doc); + } + + /** + * the transformScraper method transforms a scraper object into a document object + * @param location + * @param mimeType + * @param charSet + * 
@param scraper + * @return + */ + private static Document transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) { + final String[] sections = new String[ + scraper.getHeadlines(1).length + + scraper.getHeadlines(2).length + + scraper.getHeadlines(3).length + + scraper.getHeadlines(4).length + + scraper.getHeadlines(5).length + + scraper.getHeadlines(6).length]; + int p = 0; + for (int i = 1; i <= 6; i++) { + for (final String headline : scraper.getHeadlines(i)) { + sections[p++] = headline; + } + } + final Document ppd = new Document( + location, + mimeType, + charSet, + scraper, + scraper.getContentLanguages(), + scraper.getKeywords(), + scraper.getTitle(), + scraper.getAuthor(), + scraper.getPublisher(), + sections, + scraper.getDescription(), + scraper.getLon(), scraper.getLat(), + scraper.getText(), + scraper.getAnchors(), + scraper.getRSS(), + scraper.getImages(), + scraper.indexingDenied()); + //scraper.close(); + ppd.setFavicon(scraper.getFavicon()); + + return ppd; + } + public static ContentScraper parseToScraper( final MultiProtocolURI location, final String documentCharset, @@ -109,12 +241,12 @@ public class htmlParser extends AbstractParser implements Parser { // the author didn't tell us the encoding, try the mozilla-heuristic if (charset == null) { - final CharsetDetector det = new CharsetDetector(); - det.enableInputFilter(true); - final InputStream detStream = new BufferedInputStream(sourceStream); - det.setText(detStream); - charset = det.detect().getName(); - sourceStream = detStream; + final CharsetDetector det = new CharsetDetector(); + det.enableInputFilter(true); + final InputStream detStream = new BufferedInputStream(sourceStream); + det.setText(detStream); + charset = det.detect().getName(); + sourceStream = detStream; } // wtf? 
still nothing, just take system-standard @@ -124,11 +256,11 @@ public class htmlParser extends AbstractParser implements Parser { Charset c; try { - c = Charset.forName(charset); + c = Charset.forName(charset); } catch (final IllegalCharsetNameException e) { - c = Charset.defaultCharset(); + c = Charset.defaultCharset(); } catch (final UnsupportedCharsetException e) { - c = Charset.defaultCharset(); + c = Charset.defaultCharset(); } // parsing the content @@ -139,7 +271,7 @@ public class htmlParser extends AbstractParser implements Parser { } catch (final IOException e) { throw new Parser.Failure("IO error:" + e.getMessage(), location); } finally { - sourceStream.close(); + sourceStream.close(); writer.close(); } //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); @@ -152,59 +284,6 @@ public class htmlParser extends AbstractParser implements Parser { return scraper; } - public Document[] parse( - final MultiProtocolURI location, - final String mimeType, - final String documentCharset, - final InputStream sourceStream) throws Parser.Failure, InterruptedException { - - try { - return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream)); - } catch (final IOException e) { - throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location); - } - } - - private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) { - final String[] sections = new String[ - scraper.getHeadlines(1).length + - scraper.getHeadlines(2).length + - scraper.getHeadlines(3).length + - scraper.getHeadlines(4).length + - scraper.getHeadlines(5).length + - scraper.getHeadlines(6).length]; - int p = 0; - for (int i = 1; i <= 6; i++) { - for (final String headline : scraper.getHeadlines(i)) { - sections[p++] = headline; - } - } - final Document[] ppds = new Document[]{new Document( - location, - mimeType, - 
charSet, - scraper, - scraper.getContentLanguages(), - scraper.getKeywords(), - scraper.getTitle(), - scraper.getAuthor(), - scraper.getPublisher(), - sections, - scraper.getDescription(), - scraper.getLon(), scraper.getLat(), - scraper.getText(), - scraper.getAnchors(), - scraper.getRSS(), - scraper.getImages(), - scraper.indexingDenied())}; - //scraper.close(); - for (final Document ppd: ppds) { - ppd.setFavicon(scraper.getFavicon()); - } - return ppds; - } - - /** * some html authors use wrong encoding names, either because they don't know exactly what they * are doing or they produce a type. Many times, the upper/downcase scheme of the name is fuzzy