*) Avoid rejecting HTML content in the crawler when the file extension is not set properly

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1074 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent e9d6defce6
commit 445e3a620f

@ -258,6 +258,11 @@ public final class plasmaParser {
}
/**
 * Tells whether a resource should be accepted, judging by its MIME type and URL.
 *
 * <p>When the MIME type is {@code text/html} (compared case-insensitively after
 * trimming surrounding whitespace), the decision rests on
 * {@code supportedMimeTypesContains} alone and the file extension is not consulted.
 * For any other MIME type, including {@code null}, both
 * {@code supportedMimeTypesContains} and {@code supportedFileExt} must accept.
 *
 * @param url      location of the resource; only handed to {@code supportedFileExt}
 * @param mimeType MIME type reported for the resource, may be {@code null}
 * @return {@code true} if the checks described above pass, {@code false} otherwise
 */
public static boolean supportedContent(URL url, String mimeType) {
    // TODO: we need some exceptions here to index URLs like this
    //       http://www.musicabona.com/respighi/12668/cd/index.html.fr
    final boolean isHtml = mimeType != null && "text/html".equalsIgnoreCase(mimeType.trim());

    // The MIME type is passed on exactly as received (not trimmed).
    if (!supportedMimeTypesContains(mimeType)) {
        return false;
    }

    // HTML is accepted regardless of the URL's file extension.
    return isHtml || supportedFileExt(url);
}
@ -715,8 +720,10 @@ public final class plasmaParser {
//plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
//byte[] theText = document.getText();
//serverFileUtils.write(theText, out);
String[] sentences = document.getSentences();
for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
if (document != null) {
String[] sentences = document.getSentences();
if (sentences != null) for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
}
} catch (Exception e) {
e.printStackTrace();
}

Loading…
Cancel
Save