From 445e3a620f7571bdbe80ecf3b64a2255d9c45474 Mon Sep 17 00:00:00 2001 From: theli Date: Mon, 14 Nov 2005 10:19:40 +0000 Subject: [PATCH] *) Avoid rejection of HTML content by the crawler when the file extension is not set properly git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1074 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/plasma/plasmaParser.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 563d666d3..1c9af0da2 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -258,6 +258,11 @@ public final class plasmaParser { } public static boolean supportedContent(URL url, String mimeType) { + // TODO: we need some exceptions here to index URLs like this + // http://www.musicabona.com/respighi/12668/cd/index.html.fr + if ((mimeType!=null)&&(mimeType.trim().equalsIgnoreCase("text/html"))) { + return supportedMimeTypesContains(mimeType); + } return supportedMimeTypesContains(mimeType) && supportedFileExt(url); } @@ -715,8 +720,10 @@ public final class plasmaParser { //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray()); //byte[] theText = document.getText(); //serverFileUtils.write(theText, out); - String[] sentences = document.getSentences(); - for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]); + if (document != null) { + String[] sentences = document.getSentences(); + if (sentences != null) for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]); + } } catch (Exception e) { e.printStackTrace(); }