using mime-type instead of file extension for doctype

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@269 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 8b89f433b1
commit 4f9c30ef49

@ -532,7 +532,8 @@ public final class plasmaHTCache {
lastModified = responseHeader.lastModified(); lastModified = responseHeader.lastModified();
if (lastModified == null) lastModified = new Date(); // does not exist in header if (lastModified == null) lastModified = new Date(); // does not exist in header
} }
this.doctype = plasmaWordIndexEntry.docType(nomalizedURLString); this.doctype = plasmaWordIndexEntry.docType(responseHeader.mime());
if (this.doctype == plasmaWordIndexEntry.DT_UNKNOWN) this.doctype = plasmaWordIndexEntry.docType(url);
this.language = plasmaWordIndexEntry.language(url); this.language = plasmaWordIndexEntry.language(url);
// to be defined later: // to be defined later:

@ -50,6 +50,7 @@ import java.util.Properties;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
import de.anomic.htmlFilter.htmlFilterContentScraper;
public class plasmaWordIndexEntry { public class plasmaWordIndexEntry {
@ -88,6 +89,8 @@ public class plasmaWordIndexEntry {
public static final char DT_MOVIE = 'm'; public static final char DT_MOVIE = 'm';
public static final char DT_FLASH = 'f'; public static final char DT_FLASH = 'f';
public static final char DT_SHARE = 's'; public static final char DT_SHARE = 's';
public static final char DT_AUDIO = 'a';
public static final char DT_BINARY = 'b';
public static final char DT_UNKNOWN = 'u'; public static final char DT_UNKNOWN = 'u';
// local flag attributes // local flag attributes
@ -103,7 +106,8 @@ public class plasmaWordIndexEntry {
} }
// doctype calculation // doctype calculation
public static char docType(String path) { public static char docType(URL url) {
String path = htmlFilterContentScraper.urlNormalform(url);
char doctype = DT_UNKNOWN; char doctype = DT_UNKNOWN;
if (path.endsWith(".gif")) doctype = DT_IMAGE; if (path.endsWith(".gif")) doctype = DT_IMAGE;
if (path.endsWith(".jpg")) doctype = DT_IMAGE; if (path.endsWith(".jpg")) doctype = DT_IMAGE;
@ -125,6 +129,45 @@ public class plasmaWordIndexEntry {
return doctype; return doctype;
} }
/**
 * Derives the document type flag from a MIME type string, such as the
 * value of a Content-Type response header.
 *
 * The value is normalized before matching: anything from the first ';'
 * on (parameters such as "; charset=UTF-8") is dropped, surrounding
 * whitespace is removed and the remainder is lower-cased, because media
 * type names are case-insensitive.
 *
 * The checks are applied in sequence and a later match overrides an
 * earlier one, so the generic prefixes ("text/", "image/", ...) win over
 * the subtype suffixes, and "/html" wins over everything else.
 *
 * @param mime the MIME type, optionally with parameters; may be null
 * @return the matching DT_* flag, or DT_UNKNOWN if mime is null or not
 *         recognized, so that the caller can fall back to another
 *         heuristic such as the URL-based docType
 */
public static char docType(String mime) {
    // without a mime type nothing can be decided here
    if (mime == null) return DT_UNKNOWN;

    // strip parameters, e.g. "text/html; charset=UTF-8" -> "text/html"
    int parameterStart = mime.indexOf(';');
    if (parameterStart >= 0) mime = mime.substring(0, parameterStart);

    // type and subtype are matched case-insensitively; a fixed locale keeps
    // the result independent of the platform default (e.g. Turkish dotless i)
    mime = mime.trim().toLowerCase(java.util.Locale.ENGLISH);

    char doctype = DT_UNKNOWN;

    // subtype based detection
    if (mime.endsWith("/jpeg")) doctype = DT_IMAGE;
    if (mime.endsWith("/rtf")) doctype = DT_DOC;
    if (mime.endsWith("/msword")) doctype = DT_DOC;
    if (mime.endsWith("/mspowerpoint")) doctype = DT_DOC;
    if (mime.endsWith("/postscript")) doctype = DT_PDFPS;
    if (mime.endsWith("/pdf")) doctype = DT_PDFPS;
    if (mime.endsWith("/octet-stream")) doctype = DT_BINARY;
    if (mime.endsWith("/x-shockwave-flash")) doctype = DT_FLASH;

    // top-level type based detection; overrides the subtype result above
    if (mime.startsWith("audio/")) doctype = DT_AUDIO;
    if (mime.startsWith("video/")) doctype = DT_MOVIE;
    if (mime.startsWith("text/")) doctype = DT_TEXT;
    if (mime.startsWith("image/")) doctype = DT_IMAGE;

    // html must be checked last, otherwise "text/html" would stay DT_TEXT
    if (mime.endsWith("/html")) doctype = DT_HTML;

    // mime types that are known but not mapped to a doctype yet
    // (file extension = mime type):
    //bz2 = application/x-bzip2
    //dvi = application/x-dvi
    //gz = application/gzip
    //hqx = application/mac-binhex40
    //lha = application/x-lzh
    //lzh = application/x-lzh
    //pac = application/x-ns-proxy-autoconfig
    //php = application/x-httpd-php
    //phtml = application/x-httpd-php
    //rss = application/xml
    //tar = application/tar
    //tex = application/x-tex
    //tgz = application/tar
    //torrent = application/x-bittorrent
    //xhtml = application/xhtml+xml
    //xla = application/msexcel
    //xls = application/msexcel
    //xsl = application/xml
    //xml = application/xml
    //Z = application/x-compress
    //zip = application/zip
    return doctype;
}
// language calculation // language calculation
public static String language(URL url) { public static String language(URL url) {
String host = url.getHost(); String host = url.getHost();

Loading…
Cancel
Save