- added an 'add every media object linked in an HTML document as a new document' feature to the HTML parser. As a result, every image, application, video, or audio file that is linked in an HTML file is added as a document. This means that parsing a single HTML document may cause a number of documents to be inserted into the search index.

- some refactoring of MIME type discovery

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7919 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 9a8937f8b6
commit 610b01e1c3

@ -33,12 +33,12 @@ import java.util.List;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.document.Classification;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -102,7 +102,7 @@ public class FileLoader {
}
// create response header
String mime = MimeTable.ext2mime(url.getFileExtension());
String mime = Classification.ext2mime(url.getFileExtension());
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);

@ -42,13 +42,13 @@ import jcifs.smb.SmbFileInputStream;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.document.Classification;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
@ -122,7 +122,7 @@ public class SMBLoader {
}
// create response header
String mime = MimeTable.ext2mime(url.getFileExtension());
String mime = Classification.ext2mime(url.getFileExtension());
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);

@ -1,53 +0,0 @@
package de.anomic.data;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.util.Properties;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.logging.Log;
/**
 * File-extension to MIME-type lookup backed by a properties file
 * (extension=mimetype lines). The table is loaded once via init(); until
 * then, lookups fall back to their default values.
 * NOTE(review): this class is deleted by this commit in favour of
 * net.yacy.document.Classification, which carries the same methods.
 */
public class MimeTable {

    // extension -> MIME type mapping, populated by init()
    private static final Properties mimeTable = new Properties();

    /**
     * Loads the MIME mapping from the given properties file. Does nothing
     * if the table already holds entries; load errors are logged, not thrown.
     *
     * @param mimeFile properties file mapping extensions to MIME types
     */
    public static void init(final File mimeFile) {
        if (!mimeTable.isEmpty()) return;
        BufferedInputStream in = null;
        try {
            in = new BufferedInputStream(new FileInputStream(mimeFile));
            mimeTable.load(in);
        } catch (final Exception e) {
            Log.logException(e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (final Exception ignored) {
                    // best-effort close; nothing useful to do on failure
                }
            }
        }
    }

    /** @return the number of extension mappings currently loaded */
    public static int size() {
        return mimeTable.size();
    }

    /** @return true if init() has not (successfully) loaded any mapping yet */
    public static boolean isEmpty() {
        return mimeTable.isEmpty();
    }

    /**
     * Looks up the MIME type for a file extension, falling back to the
     * generic "application/&lt;ext&gt;" when the extension is unknown.
     */
    public static String ext2mime(final String ext) {
        return mimeTable.getProperty(ext, "application/" + ext);
    }

    /** Looks up the MIME type for an extension with a caller-supplied default. */
    public static String ext2mime(final String ext, final String dfltMime) {
        return mimeTable.getProperty(ext, dfltMime);
    }

    /** Derives the MIME type from the URL's file extension, with a default. */
    public static String url2mime(final MultiProtocolURI url, final String dfltMime) {
        return ext2mime(url.getFileExtension(), dfltMime);
    }

    /** Derives the MIME type from the URL's file extension. */
    public static String url2mime(final MultiProtocolURI url) {
        return ext2mime(url.getFileExtension());
    }
}

@ -105,7 +105,6 @@ import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.visualization.RasterPlotter;
import de.anomic.data.MimeTable;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverClassLoader;
@ -153,11 +152,11 @@ public final class HTTPDFileHandler {
if (switchboard == null) {
switchboard = theSwitchboard;
if (MimeTable.isEmpty()) {
if (Classification.countMimes() == 0) {
// load the mime table
final String mimeTablePath = theSwitchboard.getConfig("mimeTable","");
Log.logConfig("HTTPDFiles", "Loading mime mapping file " + mimeTablePath);
MimeTable.init(new File(theSwitchboard.getAppPath(), mimeTablePath));
Classification.init(new File(theSwitchboard.getAppPath(), mimeTablePath));
}
// create default files array
@ -586,7 +585,7 @@ public final class HTTPDFileHandler {
// send an image to client
targetDate = new Date(System.currentTimeMillis());
nocache = true;
final String mimeType = MimeTable.ext2mime(targetExt, "text/html");
final String mimeType = Classification.ext2mime(targetExt, "text/html");
final ByteBuffer result = RasterPlotter.exportImage(yp.getImage(), targetExt);
// write the array to the client
@ -600,7 +599,7 @@ public final class HTTPDFileHandler {
// send an image to client
targetDate = new Date(System.currentTimeMillis());
nocache = true;
final String mimeType = MimeTable.ext2mime(targetExt, "text/html");
final String mimeType = Classification.ext2mime(targetExt, "text/html");
final ByteBuffer result = yp.getImage();
// write the array to the client
@ -634,7 +633,7 @@ public final class HTTPDFileHandler {
// send an image to client
targetDate = new Date(System.currentTimeMillis());
nocache = true;
final String mimeType = MimeTable.ext2mime(targetExt, "text/html");
final String mimeType = Classification.ext2mime(targetExt, "text/html");
// generate an byte array from the generated image
int width = i.getWidth(null); if (width < 0) width = 96; // bad hack
@ -805,7 +804,7 @@ public final class HTTPDFileHandler {
// we have found a file that can be written to the client
// if this file uses templates, then we use the template
// re-write - method to create an result
String mimeType = MimeTable.ext2mime(targetExt, "text/html");
String mimeType = Classification.ext2mime(targetExt, "text/html");
String ext = (String) conProp.get("EXT"); if (ext == null) ext = "";
final boolean zipContent = requestHeader.acceptGzip() && HTTPDemon.shallTransportZipped("." + ext);
if (path.endsWith("html") ||

@ -37,6 +37,7 @@ import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Classification;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.WordTokenizer;
@ -47,7 +48,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.ByteArray;
import de.anomic.data.MimeTable;
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
@ -167,7 +167,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
final int ranking = removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() +
removeAppearanceHashes(desc, queryhashes).size();
if (ranking < 2 * queryhashes.size()) {
result.add(new MediaSnippet(mediatype, url, MimeTable.url2mime(url), desc, document.getTextLength(), null, ranking, source));
result.add(new MediaSnippet(mediatype, url, Classification.url2mime(url), desc, document.getTextLength(), null, ranking, source));
}
}
return result;
@ -196,7 +196,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() -
removeAppearanceHashes(desc, queryhashes).size();
final long ranking = Long.MAX_VALUE - (ientry.height() + 1) * (ientry.width() + 1) * (appcount + 1);
result.add(new MediaSnippet(ContentDomain.IMAGE, url, MimeTable.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, source));
result.add(new MediaSnippet(ContentDomain.IMAGE, url, Classification.url2mime(url), desc, ientry.fileSize(), ientry.width(), ientry.height(), ranking, source));
}
return result;
}

@ -23,11 +23,11 @@ package net.yacy.document;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.logging.Log;
public class Classification {
@ -38,22 +38,8 @@ public class Classification {
private static final Set<String> videoExtSet = new HashSet<String>();
private static final Set<String> appsExtSet = new HashSet<String>();
private static final Properties ext2mime = new Properties();
static {
// load a list of extensions from file
BufferedInputStream bufferedIn = null;
File mimeFile = new File("defaults/httpd.mime");
if (!mimeFile.exists()) mimeFile = new File("config/mime.properties");
try {
ext2mime.load(bufferedIn = new BufferedInputStream(new FileInputStream(mimeFile)));
} catch (final IOException e) {
Log.logSevere("Classification", "httpd.mime not found in " + mimeFile.toString(), e);
} finally {
if (bufferedIn != null) try {
bufferedIn.close();
} catch (final Exception e) {}
}
final String apps = "7z,ace,arc,arj,apk,asf,asx,bat,bin,bkf,bz2,cab,com,css,dcm,deb,dll,dmg,exe,gho,ghs,gz,hqx,img,iso,jar,lha,rar,sh,sit,sitx,tar,tbz,tgz,tib,torrent,vbs,war,zip";
final String audio = "aac,aif,aiff,flac,m4a,m4p,mid,mp2,mp3,oga,ogg,ram,sid,wav,wma";
@ -102,4 +88,41 @@ public class Classification {
return mimeType.toUpperCase().startsWith("IMAGE");
}
private static final Properties mimeTable = new Properties();
public static void init(final File mimeFile) {
if (mimeTable.isEmpty()) {
// load the mime table
BufferedInputStream mimeTableInputStream = null;
try {
mimeTableInputStream = new BufferedInputStream(new FileInputStream(mimeFile));
mimeTable.load(mimeTableInputStream);
} catch (final Exception e) {
Log.logException(e);
} finally {
if (mimeTableInputStream != null) try { mimeTableInputStream.close(); } catch (final Exception e1) {}
}
}
}
public static int countMimes() {
return mimeTable.size();
}
public static String ext2mime(final String ext) {
return mimeTable.getProperty(ext, "application/" + ext);
}
public static String ext2mime(final String ext, final String dfltMime) {
return mimeTable.getProperty(ext, dfltMime);
}
public static String url2mime(final MultiProtocolURI url, final String dfltMime) {
return ext2mime(url.getFileExtension(), dfltMime);
}
public static String url2mime(final MultiProtocolURI url) {
return ext2mime(url.getFileExtension());
}
}

@ -32,15 +32,20 @@ import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.document.AbstractParser;
import net.yacy.document.Classification;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;
@ -83,6 +88,133 @@ public class htmlParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("text/csv");
}
/**
 * Parses an HTML resource and returns the page itself plus one synthetic
 * document per linked media object (application, audio, video, image), so
 * that every media file linked from the page gets its own index entry.
 *
 * @param location URL of the resource being parsed
 * @param mimeType MIME type of the resource
 * @param documentCharset charset hint for decoding; may be overridden by detection
 * @param sourceStream raw content stream of the resource
 * @return the main document first, followed by the virtual media documents
 * @throws Parser.Failure if reading or scraping the source stream fails
 */
public Document[] parse(
        final MultiProtocolURI location,
        final String mimeType,
        final String documentCharset,
        final InputStream sourceStream) throws Parser.Failure, InterruptedException {
    try {
        // scrape the page, then turn the scraper content into the main document
        final ContentScraper pageScraper = parseToScraper(location, documentCharset, sourceStream);
        final Document mainDocument = transformScraper(location, mimeType, documentCharset, pageScraper);

        // collect the main document plus one virtual document for every
        // media object that the page links to
        final ArrayList<Document> result = new ArrayList<Document>();
        result.add(mainDocument);
        for (final Map.Entry<MultiProtocolURI, String> link : mainDocument.getApplinks().entrySet()) {
            addLinkDocs(result, "application", link.getKey(), link.getValue(), pageScraper);
        }
        for (final Map.Entry<MultiProtocolURI, String> link : mainDocument.getAudiolinks().entrySet()) {
            addLinkDocs(result, "audio", link.getKey(), link.getValue(), pageScraper);
        }
        for (final Map.Entry<MultiProtocolURI, String> link : mainDocument.getVideolinks().entrySet()) {
            addLinkDocs(result, "video", link.getKey(), link.getValue(), pageScraper);
        }
        for (final Map.Entry<MultiProtocolURI, ImageEntry> image : mainDocument.getImages().entrySet()) {
            addImageDocs(result, image.getValue());
        }
        return result.toArray(new Document[result.size()]);
    } catch (final IOException e) {
        throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
    }
}
/**
 * Appends a synthetic document for a media link (application, audio or
 * video) found in the parsed page, so the linked object becomes indexable.
 *
 * @param docs    target list the new document is appended to
 * @param type    media category, used as the section entry ("application", "audio", "video")
 * @param uri     URI of the linked media object
 * @param descr   link/anchor text describing the object
 * @param scraper scraper of the embedding page; supplies the content languages
 */
private final static void addLinkDocs(ArrayList<Document> docs, String type, MultiProtocolURI uri, String descr, ContentScraper scraper) {
    // the MIME type is guessed from the file extension; "UTF-8" is a fixed
    // charset placeholder since the media object carries no parsed text
    docs.add(new Document(
            uri,
            Classification.ext2mime(uri.getFileExtension()),
            "UTF-8",
            null,
            scraper.getContentLanguages(),
            null,
            descr,
            "",
            "",
            new String[]{descr},
            type,
            0.0f, 0.0f,
            uri.toNormalform(false, false),
            null,
            null,
            null,
            false));
}
/**
 * Appends a synthetic document for an image referenced by the parsed page,
 * so the image itself becomes indexable.
 *
 * @param docs target list the new document is appended to
 * @param img  image entry holding the image URL and its alt text
 */
private final static void addImageDocs(ArrayList<Document> docs, ImageEntry img) {
    // MIME type is guessed from the image file extension; the alt text
    // serves as both title and section content of the virtual document
    docs.add(new Document(
            img.url(),
            Classification.ext2mime(img.url().getFileExtension()),
            "UTF-8",
            null,
            null,
            null,
            img.alt(),
            "",
            "",
            new String[]{img.alt()},
            "image",
            0.0f, 0.0f,
            img.url().toNormalform(false, false),
            null,
            null,
            null,
            false));
}
/**
 * Converts the result of the HTML scraper into a single Document object.
 *
 * @param location URL of the parsed resource
 * @param mimeType MIME type of the resource
 * @param charSet  character set the resource was decoded with
 * @param scraper  filled scraper holding the extracted page content
 * @return the document assembled from the scraper's fields
 */
private static Document transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
    // flatten the headlines of all six levels into one section array
    int sectionCount = 0;
    for (int level = 1; level <= 6; level++) {
        sectionCount += scraper.getHeadlines(level).length;
    }
    final String[] sections = new String[sectionCount];
    int pos = 0;
    for (int level = 1; level <= 6; level++) {
        for (final String headline : scraper.getHeadlines(level)) {
            sections[pos++] = headline;
        }
    }
    final Document doc = new Document(
            location,
            mimeType,
            charSet,
            scraper,
            scraper.getContentLanguages(),
            scraper.getKeywords(),
            scraper.getTitle(),
            scraper.getAuthor(),
            scraper.getPublisher(),
            sections,
            scraper.getDescription(),
            scraper.getLon(), scraper.getLat(),
            scraper.getText(),
            scraper.getAnchors(),
            scraper.getRSS(),
            scraper.getImages(),
            scraper.indexingDenied());
    //scraper.close();
    doc.setFavicon(scraper.getFavicon());
    return doc;
}
public static ContentScraper parseToScraper(
final MultiProtocolURI location,
final String documentCharset,
@ -109,12 +241,12 @@ public class htmlParser extends AbstractParser implements Parser {
// the author didn't tell us the encoding, try the mozilla-heuristic
if (charset == null) {
final CharsetDetector det = new CharsetDetector();
det.enableInputFilter(true);
final InputStream detStream = new BufferedInputStream(sourceStream);
det.setText(detStream);
charset = det.detect().getName();
sourceStream = detStream;
final CharsetDetector det = new CharsetDetector();
det.enableInputFilter(true);
final InputStream detStream = new BufferedInputStream(sourceStream);
det.setText(detStream);
charset = det.detect().getName();
sourceStream = detStream;
}
// wtf? still nothing, just take system-standard
@ -124,11 +256,11 @@ public class htmlParser extends AbstractParser implements Parser {
Charset c;
try {
c = Charset.forName(charset);
c = Charset.forName(charset);
} catch (final IllegalCharsetNameException e) {
c = Charset.defaultCharset();
c = Charset.defaultCharset();
} catch (final UnsupportedCharsetException e) {
c = Charset.defaultCharset();
c = Charset.defaultCharset();
}
// parsing the content
@ -139,7 +271,7 @@ public class htmlParser extends AbstractParser implements Parser {
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {
sourceStream.close();
sourceStream.close();
writer.close();
}
//OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
@ -152,59 +284,6 @@ public class htmlParser extends AbstractParser implements Parser {
return scraper;
}
/**
 * Pre-refactoring single-document parse (the removed side of this diff):
 * scrapes the HTML resource and returns exactly one Document for it,
 * without creating extra documents for linked media objects.
 *
 * @param location URL of the resource being parsed
 * @param mimeType MIME type of the resource
 * @param documentCharset charset hint from the caller; detection may apply downstream
 * @param sourceStream raw content stream of the resource
 * @return a one-element array holding the parsed document
 * @throws Parser.Failure if an I/O error occurs while reading or scraping
 */
public Document[] parse(
final MultiProtocolURI location,
final String mimeType,
final String documentCharset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
try {
// scrape first, then convert the scraper content into the document array
return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
} catch (final IOException e) {
// wrap the low-level I/O failure into the parser's failure type
throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
}
}
/**
 * Pre-refactoring scraper-to-document conversion (the removed side of this
 * diff): builds a one-element Document array from the scraper's extracted
 * page content.
 *
 * @param location URL of the parsed resource
 * @param mimeType MIME type of the resource
 * @param charSet  character set the resource was decoded with
 * @param scraper  filled scraper holding the extracted page content
 * @return a one-element array with the assembled document
 */
private static Document[] transformScraper(final MultiProtocolURI location, final String mimeType, final String charSet, final ContentScraper scraper) {
// flatten the headlines of levels 1..6 into a single section array
final String[] sections = new String[
scraper.getHeadlines(1).length +
scraper.getHeadlines(2).length +
scraper.getHeadlines(3).length +
scraper.getHeadlines(4).length +
scraper.getHeadlines(5).length +
scraper.getHeadlines(6).length];
int p = 0;
for (int i = 1; i <= 6; i++) {
for (final String headline : scraper.getHeadlines(i)) {
sections[p++] = headline;
}
}
// assemble the single document from the scraper's fields
final Document[] ppds = new Document[]{new Document(
location,
mimeType,
charSet,
scraper,
scraper.getContentLanguages(),
scraper.getKeywords(),
scraper.getTitle(),
scraper.getAuthor(),
scraper.getPublisher(),
sections,
scraper.getDescription(),
scraper.getLon(), scraper.getLat(),
scraper.getText(),
scraper.getAnchors(),
scraper.getRSS(),
scraper.getImages(),
scraper.indexingDenied())};
//scraper.close();
// attach the page favicon to the (single) resulting document
for (final Document ppd: ppds) {
ppd.setFavicon(scraper.getFavicon());
}
return ppds;
}
/**
* some HTML authors use wrong encoding names, either because they don't know exactly what they
* are doing or they produce a typo. Many times, the upper/lowercase scheme of the name is fuzzy

Loading…
Cancel
Save