Check the URL's file extension before loading, so files with unsupported extensions (e.g. mp4) are no longer downloaded

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6200 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
lotus 16 years ago
parent b118bdd994
commit 9f083bb6b2

@ -43,7 +43,6 @@
<classpathentry kind="lib" path="lib/PDFBox-0.7.3.jar"/> <classpathentry kind="lib" path="lib/PDFBox-0.7.3.jar"/>
<classpathentry kind="lib" path="lib/poi-3.2-FINAL-20081019.jar"/> <classpathentry kind="lib" path="lib/poi-3.2-FINAL-20081019.jar"/>
<classpathentry kind="lib" path="lib/poi-scratchpad-3.2-FINAL-20081019.jar"/> <classpathentry kind="lib" path="lib/poi-scratchpad-3.2-FINAL-20081019.jar"/>
<classpathentry kind="lib" path="lib/tm-extractors-1.0.jar"/>
<classpathentry kind="lib" path="lib/webcat-0.1-swf.jar"/> <classpathentry kind="lib" path="lib/webcat-0.1-swf.jar"/>
<classpathentry kind="lib" path="lib/wsdl4j.jar"/> <classpathentry kind="lib" path="lib/wsdl4j.jar"/>
<classpathentry kind="output" path="gen"/> <classpathentry kind="output" path="gen"/>

@ -110,7 +110,7 @@
</dl> </dl>
</fieldset> </fieldset>
:: ::
You installed YaCy with a package manage. To update YaCy, use the packager manager:<p> You installed YaCy with a package manager. To update YaCy, use the package manager:<p>
Debian: apt-get update yacy Debian: apt-get update yacy
:: ::
#(/candeploy)# #(/candeploy)#

@ -119,6 +119,12 @@ public final class HTTPLoader {
final boolean ssl = entry.url().getProtocol().equals("https"); final boolean ssl = entry.url().getProtocol().equals("https");
if (port < 0) port = (ssl) ? 443 : 80; if (port < 0) port = (ssl) ? 443 : 80;
// if not the right file type then reject file
if (!Parser.supportsExtension(entry.url())) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong extension");
throw new IOException("REJECTED WRONG EXTENSION TYPE " + entry.url().getFileExtension()+ " for URL " + entry.url().toString());
}
// check if url is in blacklist // check if url is in blacklist
final String hostlow = host.toLowerCase(); final String hostlow = host.toLowerCase();
if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) { if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
@ -156,11 +162,7 @@ public final class HTTPLoader {
// request has been placed and result has been returned. work off response // request has been placed and result has been returned. work off response
//try { //try {
if (!Parser.supportsExtension(entry.url())) { if (!Parser.supportsMime(res.getResponseHeader().mime())) {
// if the response has not the right file type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong extension");
throw new IOException("REJECTED WRONG EXTENSION TYPE " + entry.url().getFileExtension()+ " for URL " + entry.url().toString());
} else if (!Parser.supportsMime(res.getResponseHeader().mime())) {
// if the response has not the right file type then reject file // if the response has not the right file type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type"); sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type");
throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString()); throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());

Loading…
Cancel
Save