From 445e3a620f7571bdbe80ecf3b64a2255d9c45474 Mon Sep 17 00:00:00 2001
From: theli <theli@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Mon, 14 Nov 2005 10:19:40 +0000
Subject: [PATCH] *) Avoid rejection of HTML content by the crawler when the
 file extension is not set properly

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1074 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 source/de/anomic/plasma/plasmaParser.java | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 563d666d3..1c9af0da2 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -258,6 +258,11 @@ public final class plasmaParser {
     }
     
     public static boolean supportedContent(URL url, String mimeType) {
+        // Exception: text/html is decided by MIME type alone, skipping the file-extension check,
+        // so that URLs like http://www.musicabona.com/respighi/12668/cd/index.html.fr are not rejected.
+        if ((mimeType!=null)&&(mimeType.trim().equalsIgnoreCase("text/html"))) {
+            return supportedMimeTypesContains(mimeType); // NOTE(review): the untrimmed value is passed on - confirm the callee normalizes it
+        }
         return supportedMimeTypesContains(mimeType) && supportedFileExt(url);
     }
     
@@ -715,8 +720,10 @@ public final class plasmaParser {
             //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
             //byte[] theText = document.getText();
             //serverFileUtils.write(theText, out);
-            String[] sentences = document.getSentences();
-            for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
+            if (document != null) {
+                String[] sentences = document.getSentences();
+                if (sentences != null) for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
+            }
         } catch (Exception e) {
             e.printStackTrace();
         }