Bugfixed UTF-8 decoding and parser.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@346 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 63f9570d3a
commit 712fe9ef18

@ -121,8 +121,9 @@ public class CacheAdmin_p {
else {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
serverFileUtils.copy(file, os);
os.flush();
plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
info += "<b>HEADLINE:</b><br>" + scraper.getHeadline() + "<br><br>";
info += "<b>HREF:</b><br>" + formatAnchor(document.getHyperlinks()) + "<br>";
info += "<b>MEDIA:</b><br>" + formatAnchor(document.getMedialinks()) + "<br>";
@ -130,7 +131,7 @@ public class CacheAdmin_p {
info += "<b>TEXT:</b><br><span class=\"small\">" + new String(scraper.getText()) + "</span><br>";
info += "<b>LINES:</b><br><span class=\"small\">";
String[] sentences = document.getSentences();
for (int i = 0; i < sentences.length; i++) info += sentences + "<br>";
for (int i = 0; i < sentences.length; i++) info += sentences[i] + "<br>";
info += "</span><br>";
}
} catch (Exception e) {

File diff suppressed because one or more lines are too long

@ -41,6 +41,7 @@
package de.anomic.htmlFilter;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
@ -178,4 +179,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
System.out.println("TEXT :" + new String(text.getBytes()));
}
/**
 * Ad-hoc smoke test: scrapes a fixed string containing a German umlaut
 * through the content scraper and prints the extracted text, so charset
 * round-tripping can be checked by eye on the console.
 */
public static void main(String[] args) {
    String test = "Nokia kürzt bei Forschung und Entwicklung";
    try {
        htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost"));
        // NOTE(review): getBytes() uses the platform default charset — presumably
        // UTF-8 is intended here (the commit is a UTF-8 bugfix); confirm.
        scraper.scrapeText(test.getBytes());
        System.out.println(new String(scraper.getText()));
    } catch (MalformedURLException e) {
        // Fix: previously an empty catch silently swallowed the error,
        // making the smoke test pass vacuously. Report it instead.
        e.printStackTrace();
    }
}
}

@ -149,8 +149,8 @@ public final class plasmaParser {
* @see #initMediaExt(String)
*/
static {
initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
"sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj"));
initMediaExt(extString2extList("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
"sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj"));
/* ===================================================
* initializing the parser object pool
@ -383,21 +383,21 @@ public final class plasmaParser {
private static void loadEnabledParserList() {
// loading a list of availabe parser from file
Properties prop = new Properties();
Properties prop = new Properties();
BufferedInputStream bufferedIn = null;
try {
prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("yacy.parser"))));
} catch (IOException e) {
System.err.println("ERROR: yacy.parser not found in settings path");
} finally {
try {
prop.load(bufferedIn = new BufferedInputStream(new FileInputStream(new File("yacy.parser"))));
} catch (IOException e) {
System.err.println("ERROR: yacy.parser not found in settings path");
} finally {
if (bufferedIn != null) try{ bufferedIn.close(); }catch(Exception e){}
}
// enable them ...
setEnabledParserList(prop.keySet());
}
private static void loadAvailableParserList() {
}
private static void loadAvailableParserList() {
try {
plasmaParser.availableParserList.clear();
@ -405,24 +405,24 @@ public final class plasmaParser {
String javaClassPath = System.getProperty("java.class.path");
// getting the current package name
String plasmaParserPkgName = plasmaParser.class.getPackage().getName() + ".parser";
String plasmaParserPkgName = plasmaParser.class.getPackage().getName() + ".parser";
serverLog.logInfo("PARSER","Searching for additional content parsers in package " + plasmaParserPkgName);
// getting an uri to the parser subpackage
String packageURI = plasmaParser.class.getResource("/"+plasmaParserPkgName.replace('.','/')).toString();
serverLog.logDebug("PARSER", "Parser directory is " + packageURI);
String packageURI = plasmaParser.class.getResource("/"+plasmaParserPkgName.replace('.','/')).toString();
serverLog.logDebug("PARSER", "Parser directory is " + packageURI);
// open the parser directory
File parserDir = new File(new URI(packageURI));
File parserDir = new File(new URI(packageURI));
if ((parserDir == null) || (!parserDir.exists()) || (!parserDir.isDirectory())) return;
/*
* loop through all subdirectories and test if we can
/*
* loop through all subdirectories and test if we can
* find an additional parser class
*/
File[] parserDirectories = parserDir.listFiles(parserDirectoryFilter);
if (parserDirectories == null) return;
for (int parserDirNr=0; parserDirNr< parserDirectories.length; parserDirNr++) {
for (int parserDirNr=0; parserDirNr< parserDirectories.length; parserDirNr++) {
File currentDir = parserDirectories[parserDirNr];
serverLog.logDebug("PARSER", "Searching in directory " + currentDir.toString());
String[] parserClasses = currentDir.list(parserFileNameFilter);
@ -432,7 +432,7 @@ public final class plasmaParser {
serverLog.logDebug("PARSER", "Testing parser class " + parserClasses[parserNr]);
String className = parserClasses[parserNr].substring(0,parserClasses[parserNr].indexOf(".class"));
String fullClassName = plasmaParserPkgName + "." + currentDir.getName() + "." + className;
try {
try {
// trying to load the parser class by its name
Class parserClass = Class.forName(fullClassName);
Object theParser = parserClass.newInstance();
@ -446,7 +446,7 @@ public final class plasmaParser {
throw new ParserException("Missing dependency detected: '" + neededLibx[libxId] + "'.");
}
}
}
}
// loading the list of mime-types that are supported by this parser class
Hashtable supportedMimeTypes = ((Parser)theParser).getSupportedMimeTypes();
@ -456,31 +456,31 @@ public final class plasmaParser {
availableParserList.put(mimeType,fullClassName);
serverLog.logInfo("PARSER", "Found functional parser for mimeType '" + mimeType + "'.");
}
} catch (Exception e) { /* we can ignore this for the moment */
} catch (Exception e) { /* we can ignore this for the moment */
serverLog.logWarning("PARSER", "Parser '" + className + "' doesn't work correctly and will be ignored.\n [" + e.getClass().getName() + "]: " + e.getMessage());
} catch (Error e) { /* we can ignore this for the moment */
} catch (Error e) { /* we can ignore this for the moment */
serverLog.logWarning("PARSER", "Parser '" + className + "' doesn't work correctly and will be ignored.\n [" + e.getClass().getName() + "]: " + e.getMessage());
}
}
}
}
} catch (Exception e) {
serverLog.logError("PARSER", "Unable to determine all installed parsers. " + e.getMessage());
}
}
public void close() {
}
}
public void close() {
// clearing the parser list
synchronized (this.enabledParserList) {
this.enabledParserList.clear();
}
this.enabledParserList.clear();
}
// closing the parser object pool
try {
this.theParserPool.close();
} catch (Exception e) { }
}
try {
this.theParserPool.close();
} catch (Exception e) { }
}
public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) {
@ -498,7 +498,6 @@ public final class plasmaParser {
// ... otherwise we make a html scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
hfos.write(source);
hfos.close();
return transformScraper(location, mimeType, scraper);
@ -660,22 +659,24 @@ public final class plasmaParser {
return v;
}
public static void main(String[] args) {
//javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
try {
File in = new File(args[0]);
File out = new File(args[1]);
plasmaParser theParser = new plasmaParser();
public static void main(String[] args) {
//javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
try {
File in = new File(args[0]);
//File out = new File(args[1]);
plasmaParser theParser = new plasmaParser();
theParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
theParser.initParseableMimeTypes("application/atom+xml,application/gzip,application/java-archive,application/msword,application/octet-stream,application/pdf,application/rdf+xml,application/rss+xml,application/rtf,application/x-gzip,application/x-tar,application/xml,application/zip,text/rss,text/rtf,text/xml,application/x-bzip2");
FileInputStream theInput = new FileInputStream(in);
ByteArrayOutputStream theOutput = new ByteArrayOutputStream();
serverFileUtils.copy(theInput, theOutput);
plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray());
//plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
byte[] theText = document.getText();
serverFileUtils.write(theText, out);
FileInputStream theInput = new FileInputStream(in);
ByteArrayOutputStream theOutput = new ByteArrayOutputStream();
serverFileUtils.copy(theInput, theOutput);
plasmaParserDocument document = theParser.parseSource(new URL("http://brain/~theli/test.pdf"), null, theOutput.toByteArray());
//plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
//byte[] theText = document.getText();
//serverFileUtils.write(theText, out);
String[] sentences = document.getSentences();
for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
} catch (Exception e) {
e.printStackTrace();
}

@ -105,6 +105,7 @@ public class plasmaSnippetCache {
}
public result retrieve(URL url, Set queryhashes, boolean fetchOnline) {
// heise = "0OQUNU3JSs05"
if (queryhashes.size() == 0) {
//System.out.println("found no queryhashes for url retrieve " + url);
return new result(null, SOURCE_ERROR, "no query hashes given");
@ -250,7 +251,7 @@ public class plasmaSnippetCache {
} catch (IOException e) {}
if (header == null) {
String filename = url.getFile();
String filename = cacheManager.getCachePath(url).getName();
int p = filename.lastIndexOf('.');
if ((p < 0) ||
((p >= 0) && (plasmaParser.supportedFileExtContains(filename.substring(p + 1))))) {

@ -371,6 +371,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
deployThread("99_indexcachemigration", "index cache migration", "migration of index cache data structures 0.37 -> 0.38",
new serverInstantThread(classicCache, "oneStepMigration", "size"), 30000);
}
// test routine for snippet fetch
// url = /www.heise.de/mobil/newsticker/meldung/mail/54980
Set query = new HashSet(); query.add("0OQUNU3JSs05"); // 'heise'
//plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true);
plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true);
}
private static String ppRamString(int bytes) {

@ -90,12 +90,12 @@ public final class serverFileUtils {
FileInputStream fis = null;
FileOutputStream fos = null;
try {
fis = new FileInputStream(source);
fos = new FileOutputStream(dest);
copy(fis, fos);
fis = new FileInputStream(source);
fos = new FileOutputStream(dest);
copy(fis, fos);
} finally {
if (fis != null) try {fis.close();} catch (Exception e) {}
if (fos != null) try {fos.close();} catch (Exception e) {}
if (fos != null) try {fos.close();} catch (Exception e) {}
}
}
@ -107,16 +107,16 @@ public final class serverFileUtils {
}
/**
 * Reads the whole file into a byte array.
 *
 * The buffer is sized from File.length(), and the read loop keeps calling
 * read() until the buffer is full or EOF, because a single read() call is
 * not guaranteed to fill the requested range.
 *
 * @param source file to read (must fit in memory; length must fit in an int)
 * @return the file content; if the file shrinks between length() and the
 *         reads, the tail of the returned array stays zero-filled
 * @throws IOException if the file cannot be opened or read
 */
public static byte[] read(File source) throws IOException {
    byte[] buffer = new byte[(int) source.length()];
    InputStream fis = null;
    try {
        fis = new FileInputStream(source);
        int p = 0, c;
        while ((c = fis.read(buffer, p, buffer.length - p)) > 0) p += c;
    } finally {
        // always release the file handle, even on a failed read
        if (fis != null) try { fis.close(); } catch (Exception e) {}
    }
    return buffer;
}
public static byte[] readAndZip(File source) throws IOException {

@ -148,6 +148,10 @@ public final class yacy {
plasmaSwitchboard sb = new plasmaSwitchboard(homePath, "yacy.init", "DATA/SETTINGS/httpProxy.conf");
// hardcoded, forced, temporary value-migration
sb.setConfig("htTemplatePath", "htroot/env/templates");
sb.setConfig("parseableExt", "html,htm,txt,php,shtml,asp");
// if we are running an SVN version, we try to detect the used svn revision now ...
if (vString.equals("@" + "REPL_VERSION" + "@")) {
Properties buildProp = new Properties();
@ -188,9 +192,6 @@ public final class yacy {
if (timeout < 60000) timeout = 60000;
int maxSessions = Integer.parseInt(sb.getConfig("httpdMaxSessions", "100"));
// hardcoded, forced, temporary value-migration
sb.setConfig("htTemplatePath", "htroot/env/templates");
// create some directories
File htRootPath = new File(sb.getRootPath(), sb.getConfig("htRootPath", "htroot"));
File htDocsPath = new File(sb.getRootPath(), sb.getConfig("htDocsPath", "DATA/HTDOCS"));

@ -100,7 +100,7 @@ parseableMimeTypes=
# this is important for recognizing <a href> tags that reference non-HTML content
# These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
mediaExt=ace,arj,asf,avi,bin,bz2,css,deb,doc,dmg,gif,gz,hqx,img,iso,jar,jpe,jpg,jpeg,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,sit,swf,sxc,sxd,sxi,sxw,tar,tgz,torrent,wmv,xcf,xls,zip
parseableExt=html,htm,txt
parseableExt=html,htm,txt,php,shtml,asp
# Promotion Strings
# These strings appear in the Web Mask of the YACY search client

Loading…
Cancel
Save