- enhanced parser: collection of audio, video, image and application links

- enhanced condenser: better handling of utf-8 and pre-formatted texts

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3017 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago · ceb9e3aa17
parent 984285bdd6
commit ceb9e3aa17
16 changed files with 263 additions and 203 deletions
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -124,11 +124,14 @@ public class CacheAdmin_p {
                    info.append("<b>TITLE:</b><br>").append(scraper.getTitle()).append("<br>").append("<br>")
                        .append("<b>SECTION HEADLINES:</b><br>").append(formatTitles(document.getSectionTitles())).append("<br>")
                        .append("<b>HREF:</b><br>").append(formatAnchor(document.getHyperlinks())).append("<br>")
-                        .append("<b>MEDIA:</b><br>").append(formatAnchor(document.getMedialinks())).append("<br>")
+                        .append("<b>IMAGE:</b><br>").append(formatAnchor(document.getImagelinks())).append("<br>")
+                        .append("<b>AUDIO:</b><br>").append(formatAnchor(document.getAudiolinks())).append("<br>")
+                        .append("<b>VIDEO:</b><br>").append(formatAnchor(document.getVideolinks())).append("<br>")
+                        .append("<b>APPS:</b><br>").append(formatAnchor(document.getApplinks())).append("<br>")
                        .append("<b>EMAIL:</b><br>").append(formatAnchor(document.getEmaillinks())).append("<br>")
                        .append("<b>TEXT:</b><br><span class=\"small\">").append(new String(scraper.getText())).append("</span><br>")
                        .append("<b>LINES:</b><br><span class=\"small\">");
-                    final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+                    final Enumeration sentences = document.getSentences(false);
                    if (sentences != null) while (sentences.hasMoreElements()) {
                        info.append((String) sentences.nextElement()).append("<br>");
                    }
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -57,6 +57,7 @@ import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
 import de.anomic.index.indexURLEntry;
 import de.anomic.net.URL;
+import de.anomic.plasma.plasmaCondenser;
 import de.anomic.plasma.plasmaHTCache;
 import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.plasma.plasmaSwitchboard;
@@ -99,7 +100,8 @@ public class ViewFile {
        URL url = null;
        String descr = "";
        int wordCount = 0;
-        int size = 0;    
+        int size = 0;
+        boolean pre = false;
        
        // getting the url hash from which the content should be loaded
        String urlHash = post.get("urlHash","");       
@@ -124,6 +126,7 @@ public class ViewFile {
            descr = comp.descr();
            urlEntry.wordCount();
            size = urlEntry.size();
+            pre = urlEntry.flags().get(plasmaCondenser.flag_cat_indexof);
        }

        // alternatively, get the url simply from a url String
@@ -140,6 +143,7 @@ public class ViewFile {

            // define an url by post parameter
            url = new URL(urlString);
+            pre = post.get("pre", "false").equals("true");
        } catch (MalformedURLException e) {}
        
        
@@ -303,14 +307,13 @@ public class ViewFile {
                prop.put("viewMode_parsedText", content);
            } else {
                prop.put("viewMode", VIEW_MODE_AS_PARSED_SENTENCES);
-                final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+                final Enumeration sentences = document.getSentences(pre);

                boolean dark = true;
                int i = 0;
                if (sentences != null)
                    while (sentences.hasMoreElements()) {
-                        String currentSentence = wikiCode
-                                .replaceHTML((String) sentences.nextElement());
+                        String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());

                        // Search word highlighting
                        String words = post.get("words", null);
--- a/htroot/htdocsdefault/dir.java
+++ b/htroot/htdocsdefault/dir.java
@@ -360,7 +360,7 @@ public class dir {
    public static void indexPhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr, byte[] md5) {
        try {
            final URL url = new URL(urlstring);
-            final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()));
+            final plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(("yacyshare. " + phrase + ". " + descr).getBytes()), "UTF-8");
            final indexURLEntry newEntry = switchboard.urlPool.loadedURL.newEntry(
                url,
                "YaCyShare: " + descr,
@@ -395,7 +395,7 @@ public class dir {
    public static void deletePhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr) {
        try {
            final String urlhash = plasmaURL.urlHash(new URL(urlstring));
-            final Iterator words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8"));
+            final Iterator words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes("UTF-8"), "UTF-8");
            Map.Entry entry;
            while (words.hasNext()) {
                entry = (Map.Entry) words.next();
--- a/htroot/xml/snippet.java
+++ b/htroot/xml/snippet.java
@@ -28,6 +28,9 @@ public class snippet {
        // if 'remove' is set to true, then RWI references to URLs that do not have the snippet are removed
        boolean remove = post.get("remove", "false").equals("true");
        
+        // boolean line_end_with_punctuation
+        boolean pre = post.get("pre", "false").equals("true");
+        
        String querystring = post.get("search", "").trim();
        if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
            querystring = querystring.substring(1, querystring.length() - 1).trim();
@@ -40,10 +43,9 @@ public class snippet {
            kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
        }        
        
-        // do the search
-        Set queryHashes = plasmaCondenser.words2hashes(query);
-        
-        plasmaSnippetCache.Snippet snippet = switchboard.snippetCache.retrieveSnippet(url, queryHashes, true, 260, 10000);
+        // find snippet
+        Set queryHashes = plasmaCondenser.words2hashes(query);        
+        plasmaSnippetCache.Snippet snippet = switchboard.snippetCache.retrieveSnippet(url, queryHashes, true, pre, 260, 10000);
        prop.put("status",snippet.getSource());
        if (snippet.getSource() < 11) {
            //prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");
--- a/htroot/yacy/search.java
+++ b/htroot/yacy/search.java
@@ -56,6 +56,7 @@ import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.index.indexContainer;
 import de.anomic.plasma.plasmaURL;
 import de.anomic.index.indexURLEntry;
+import de.anomic.plasma.plasmaCondenser;
 import de.anomic.plasma.plasmaSearchEvent;
 import de.anomic.plasma.plasmaSearchQuery;
 import de.anomic.plasma.plasmaSearchRankingProfile;
@@ -256,7 +257,7 @@ public final class search {
            while ((acc.hasMoreElements()) && (i < squery.wantedResults)) {
                urlentry = (indexURLEntry) acc.nextElement();
                if (includesnippet) {
-                    snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, 260, 1000);
+                    snippet = sb.snippetCache.retrieveSnippet(urlentry.comp().url(), squery.queryHashes, false, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), 260, 1000);
                } else {
                    snippet = null;
                }
--- a/source/de/anomic/index/indexURLEntry.java
+++ b/source/de/anomic/index/indexURLEntry.java
@@ -31,6 +31,7 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Date;

+import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.kelondro.kelondroRow;
 import de.anomic.net.URL;
 import de.anomic.index.indexRWIEntry;
@@ -49,6 +50,7 @@ public interface indexURLEntry {
    public int size();
    public int wordCount();
    public String snippet();
+    public kelondroBitfield flags();
    public indexRWIEntry word();
    public boolean isOlder(indexURLEntry other);
    public String toString(String snippet);
--- a/source/de/anomic/index/indexURLEntryOld.java
+++ b/source/de/anomic/index/indexURLEntryOld.java
@@ -35,6 +35,7 @@ import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.kelondro.kelondroRow;
 import de.anomic.net.URL;
+import de.anomic.plasma.plasmaSearchQuery;
 import de.anomic.plasma.plasmaURL;
 import de.anomic.server.logging.serverLog;
 import de.anomic.tools.crypt;
@@ -262,6 +263,10 @@ public class indexURLEntryOld implements indexURLEntry {
    public int wordCount() {
        return wordCount;
    }
+    
+    public kelondroBitfield flags() {
+        return plasmaSearchQuery.empty_constraint;
+    }

    public String snippet() {
        // the snippet may appear here if the url was transported in a remote search
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -107,7 +107,7 @@ public final class plasmaCondenser {
    private int wordminsize;
    private int wordcut;

-    public int RESULT_NUMB_TEXT_BYTES = -1;
+    //public int RESULT_NUMB_TEXT_BYTES = -1;
    public int RESULT_NUMB_WORDS = -1;
    public int RESULT_DIFF_WORDS = -1;
    public int RESULT_SIMI_WORDS = -1;
@@ -117,17 +117,17 @@ public final class plasmaCondenser {
    public int RESULT_SIMI_SENTENCES = -1;
    public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);
    
-    public plasmaCondenser(InputStream text) {
-        this(text, 3, 2);
+    public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException {
+        this(text, charset, 3, 2);
    }

-    public plasmaCondenser(InputStream text, int wordminsize, int wordcut) {
+    public plasmaCondenser(InputStream text, String charset, int wordminsize, int wordcut) throws UnsupportedEncodingException {
        this.wordminsize = wordminsize;
        this.wordcut = wordcut;
        // analysis = new Properties();
        words = new TreeMap();
        sentences = new HashMap();
-        createCondensement(text);
+        createCondensement(text, charset);
    }

    // create a word hash
@@ -225,7 +225,7 @@ public final class plasmaCondenser {
        return s;
    }

-    private void createCondensement(InputStream is) {
+    private void createCondensement(InputStream is, String charset) throws UnsupportedEncodingException {

        words = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/);
        sentences = new HashMap();
@@ -244,10 +244,10 @@ public final class plasmaCondenser {
        int idx;
        int wordInSentenceCounter = 1;
        Iterator it, it1;
-        boolean comb_indexof = false, comb_lastmodified = false, last_last = false, last_index = false;
+        boolean comb_indexof = false, last_last = false, last_index = false;
        
        // read source
-        sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize);
+        sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize);
        while (wordenum.hasMoreElements()) {
            word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
            //System.out.println("PARSED-WORD " + word);
@@ -285,7 +285,10 @@ public final class plasmaCondenser {
                wordInSentenceCounter = 1;
            } else {
                // check index.of detection
-                if ((last_last) && (word.equals("modified"))) comb_lastmodified = true;
+                if ((last_last) && (comb_indexof) && (word.equals("modified"))) {
+                    this.RESULT_FLAGS.set(flag_cat_indexof, true);
+                    wordenum.pre(true); // parse lines as they come with CRLF
+                }
                if ((last_index) && (word.equals("of"))) comb_indexof = true;
                last_last = word.equals("last");
                last_index = word.equals("index");
@@ -412,7 +415,7 @@ public final class plasmaCondenser {
        }

        // store result
-        this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
+        //this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
        this.RESULT_NUMB_WORDS = allwordcounter;
        this.RESULT_DIFF_WORDS = wordHandleCount;
        this.RESULT_SIMI_WORDS = words.size();
@@ -420,7 +423,6 @@ public final class plasmaCondenser {
        this.RESULT_NUMB_SENTENCES = allsentencecounter;
        this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
        this.RESULT_SIMI_SENTENCES = sentences.size();
-        this.RESULT_FLAGS.set(flag_cat_indexof, comb_indexof && comb_lastmodified);
    }

    public void print() {
@@ -544,10 +546,9 @@ public final class plasmaCondenser {
        return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
    }

-    public static Enumeration wordTokenizer(String s, int minLength) {
+    public static Enumeration wordTokenizer(String s, String charset, int minLength) {
        try {
-            // TODO: Bugfix for UTF-8 needed
-            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), minLength);
+            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), charset, minLength);
        } catch (Exception e) {
            return null;
        }
@@ -560,13 +561,17 @@ public final class plasmaCondenser {
        unsievedWordsEnum e;
        int ml;

-        public sievedWordsEnum(InputStream is, int minLength) {
-            e = new unsievedWordsEnum(is);
+        public sievedWordsEnum(InputStream is, String charset, int minLength) throws UnsupportedEncodingException {
+            e = new unsievedWordsEnum(is, charset);
            buffer = nextElement0();
            ml = minLength;
        }

-	    private Object nextElement0() {
+        public void pre(boolean x) {
+            e.pre(x);
+        }
+        
+        private Object nextElement0() {
            String s;
            char c;
            loop: while (e.hasMoreElements()) {
@@ -596,23 +601,24 @@ public final class plasmaCondenser {
            return r;
        }

-        public int count() {
-            return e.count();
-        }
    }

    private static class unsievedWordsEnum implements Enumeration {
        
        Object buffer = null;
-        linesFromFileEnum e;
+        sentencesFromInputStreamEnum e;
        String s;

-        public unsievedWordsEnum(InputStream is) {
-            e = new linesFromFileEnum(is);
+        public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException {
+            e = new sentencesFromInputStreamEnum(is, charset);
            s = "";
            buffer = nextElement0();
        }

+        public void pre(boolean x) {
+            e.pre(x);
+        }
+        
        private Object nextElement0() {
            String r;
            StringBuffer sb;
@@ -656,66 +662,9 @@ public final class plasmaCondenser {
            return r;
        }

-        public int count() {
-            return e.count();
-        }
-    }
-
-    private static class linesFromFileEnum implements Enumeration {
-        // read in lines from a given input stream
-        // every line starting with a '#' is treated as a comment.
-
-        Object buffer = null;
-        BufferedReader raf;
-        int counter = 0;
-
-        public linesFromFileEnum(InputStream is) {
-            raf = new BufferedReader(new InputStreamReader(is)); // TODO: bugfix needed for UTF-8, use charset for reader
-            buffer = nextElement0();
-            counter = 0;
-        }
-
-        private Object nextElement0() {
-            try {
-                String s;
-                while (true) {
-                    s = raf.readLine();
-                    if (s == null) {
-                        raf.close();
-                        return null;
-                    }
-                    if (!(s.startsWith("#"))) return s;
-                }
-            } catch (IOException e) {
-                try {
-                    raf.close();
-                } catch (Exception ee) {
-                }
-                return null;
-            }
-        }
-
-        public boolean hasMoreElements() {
-            return buffer != null;
-        }
-
-        public Object nextElement() {
-            if (buffer == null) {
-                return null;
-            } else {
-                counter = counter + ((String) buffer).length() + 1;
-                Object r = buffer;
-                buffer = nextElement0();
-                return r;
-            }
-        }
-
-        public int count() {
-            return counter;
-        }
    }
    
-    public static Enumeration sentencesFromInputStream(InputStream is, String charset) {
+    public static sentencesFromInputStreamEnum sentencesFromInputStream(InputStream is, String charset) {
        try {
            return new sentencesFromInputStreamEnum(is, charset);
        } catch (UnsupportedEncodingException e) {
@@ -723,23 +672,29 @@ public final class plasmaCondenser {
        }
    }
    
-    private static class sentencesFromInputStreamEnum implements Enumeration {
+    public static class sentencesFromInputStreamEnum implements Enumeration {
        // read sentences from a given input stream
        // this enumerates String objects
        
        Object buffer = null;
        BufferedReader raf;
        int counter = 0;
+        boolean pre = false;

        public sentencesFromInputStreamEnum(InputStream is, String charset) throws UnsupportedEncodingException {
            raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset));
            buffer = nextElement0();
            counter = 0;
+            pre = false;
        }

+        public void pre(boolean x) {
+            this.pre = x;
+        }
+        
        private Object nextElement0() {
            try {
-                String s = readSentence(raf);
+                String s = readSentence(raf, pre);
                //System.out.println(" SENTENCE='" + s + "'"); // DEBUG 
                if (s == null) {
                    raf.close();
@@ -775,7 +730,7 @@ public final class plasmaCondenser {
        }
    }

-    static String readSentence(Reader reader) throws IOException {
+    static String readSentence(Reader reader, boolean pre) throws IOException {
        StringBuffer s = new StringBuffer();
        int nextChar;
        char c;
@@ -789,7 +744,11 @@ public final class plasmaCondenser {
            }
            c = (char) nextChar;
            s.append(c);
-            if (htmlFilterContentScraper.punctuation(c)) break;
+            if (pre) {
+                if ((c == (char) 10) || (c == (char) 13)) break;
+            } else {
+                if (htmlFilterContentScraper.punctuation(c)) break;
+            }
        }

        // replace line endings and tabs by blanks
@@ -802,16 +761,16 @@ public final class plasmaCondenser {
        
    }
    
-    public static Iterator getWords(InputStream input) {
+    public static Iterator getWords(InputStream input, String charset) throws UnsupportedEncodingException {
        if (input == null) return null;
-        plasmaCondenser condenser = new plasmaCondenser(input);
+        plasmaCondenser condenser = new plasmaCondenser(input, charset);
        return condenser.words();        
    }
    
-    public static Iterator getWords(byte[] text) {
+    public static Iterator getWords(byte[] text, String charset) throws UnsupportedEncodingException {
        if (text == null) return null;
        ByteArrayInputStream buffer = new ByteArrayInputStream(text);
-        return getWords(buffer);
+        return getWords(buffer, charset);
    }
        
    public static void main(String[] args) {
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -139,9 +139,12 @@ public final class plasmaParser {
    private static final HashSet mediaExtSet = new HashSet();
    
    /**
-     * A list of image extensions that should be handleable by image viewer apps
+     * A list of image, audio, video and application extensions
     */
    private static final HashSet imageExtSet = new HashSet();
+    private static final HashSet audioExtSet = new HashSet();
+    private static final HashSet videoExtSet = new HashSet();
+    private static final HashSet appsExtSet = new HashSet();
    
    /**
     * This {@link FilenameFilter} is used to find all classes based on there filenames 
@@ -169,17 +172,23 @@ public final class plasmaParser {
     * @see #initMediaExt(String)
     */
    static {
+        String apps = "sit,hqx,img,dmg,exe,com,bat,sh";
+        String audio = "mp2,mp3,ogg,aac,aif,aiff,wav";
+        String video = "swf,avi,wmv,rm,mov,mpg,mpeg,ram,m4v";
+        String image = "jpg,jpeg,jpe,gif,png";
        initMediaExt(extString2extList(
-                "sit,hqx,img,dmg,exe,com,bat,sh" +   // application container
-                "tar,gz,bz2,arj,zip,rar," +          // archive formats
-                "ps,xls,ppt,asf," +                  // text formats without support
-                "mp3,ogg,aac," +                     // audio formats
-                "swf,avi,wmv,rm,mov,mpg,mpeg,ram," + // video formats
-                "jpg,jpeg,jpe,gif,png"               // image formats
-                ));
-        initImageExt(extString2extList(
-                "jpg,jpeg,jpe,gif,png"               // image formats
+                apps + "," +  // application container
+                "tar,gz,bz2,arj,zip,rar," + // archive formats
+                "ps,xls,ppt,asf," +         // text formats without support
+                audio + "," +               // audio formats
+                video + "," +               // video formats
+                image                       // image formats
                ));
+        initImageExt(extString2extList(image));  // image formats
+        initAudioExt(extString2extList(audio));  // audio formats
+        initVideoExt(extString2extList(video));  // video formats
+        initAppsExt(extString2extList(apps));    // application formats
+                
        
        /* ===================================================
         * initializing the parser object pool
@@ -272,6 +281,27 @@ public final class plasmaParser {
        }
    }
    
+    public static void initAudioExt(List audioExtList) {
+        synchronized (audioExtSet) {
+            audioExtSet.clear();
+            audioExtSet.addAll(audioExtList);
+        }
+    }
+    
+    public static void initVideoExt(List videoExtList) {
+        synchronized (videoExtSet) {
+            videoExtSet.clear();
+            videoExtSet.addAll(videoExtList);
+        }
+    }
+    
+    public static void initAppsExt(List appsExtList) {
+        synchronized (appsExtSet) {
+            appsExtSet.clear();
+            appsExtSet.addAll(appsExtList);
+        }
+    }
+    
    public static String getMediaExtList() {
        synchronized (mediaExtSet) {
            return mediaExtSet.toString();
@@ -343,6 +373,27 @@ public final class plasmaParser {
        }
    }

+    public static boolean audioExtContains(String audioExt) {
+        if (audioExt == null) return false;
+        synchronized (audioExtSet) {
+            return audioExtSet.contains(audioExt.trim().toLowerCase());
+        }
+    }
+
+    public static boolean videoExtContains(String videoExt) {
+        if (videoExt == null) return false;
+        synchronized (videoExtSet) {
+            return videoExtSet.contains(videoExt.trim().toLowerCase());
+        }
+    }
+
+    public static boolean appsExtContains(String appsExt) {
+        if (appsExt == null) return false;
+        synchronized (appsExtSet) {
+            return appsExtSet.contains(appsExt.trim().toLowerCase());
+        }
+    }
+
    public static String getRealCharsetEncoding(String encoding) {
    	if ((encoding == null) || (encoding.length() == 0)) return "ISO-8859-1";
    	
@@ -887,7 +938,7 @@ public final class plasmaParser {
                System.out.println(document.getMainLongTitle());
                
                // found text
-                final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+                final Enumeration sentences = document.getSentences(false);
                int i = 0;
                if (sentences != null) while (sentences.hasMoreElements()) {
                        System.out.print("line " + i + ": ");
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -75,8 +75,7 @@ public class plasmaParserDocument {
    // the anchors and images - Maps are URL-to-EntityDescription mappings.
    // The EntityDescription appear either as visible text in anchors or as alternative
    // text in image tags.
-    Map hyperlinks;
-    Map medialinks;
+    Map hyperlinks, audiolinks, videolinks, imagelinks, applinks;
    Map emaillinks;
    plasmaCondenser condenser;
    boolean resorted;
@@ -98,7 +97,10 @@ public class plasmaParserDocument {
        this.anchors = (anchors==null)?new HashMap(0):anchors;
        this.images = (images==null)?new TreeSet():images;
        this.hyperlinks = null;
-        this.medialinks = null;
+        this.audiolinks = null;
+        this.videolinks = null;
+        this.imagelinks = null;
+        this.applinks = null;
        this.emaillinks = null;
        this.condenser = null;
        this.resorted = false;
@@ -121,7 +123,10 @@ public class plasmaParserDocument {
        this.anchors = (anchors==null)?new HashMap(0):anchors;
        this.images = (images==null)?new TreeSet():images;
        this.hyperlinks = null;
-        this.medialinks = null;
+        this.audiolinks = null;
+        this.videolinks = null;
+        this.imagelinks = null;
+        this.applinks = null;
        this.emaillinks = null;
        this.condenser = null;
        this.resorted = false;
@@ -190,9 +195,11 @@ public class plasmaParserDocument {
        return -1; 
    }
    
-    public Enumeration getSentences(String charset) {
+    public Enumeration getSentences(boolean pre) {
        if (this.text == null) return null;
-        return plasmaCondenser.sentencesFromInputStream(getText(), charset);
+        plasmaCondenser.sentencesFromInputStreamEnum e = plasmaCondenser.sentencesFromInputStream(getText(), this.charset);
+        e.pre(pre);
+        return e;
    }
    
    public String getKeywords(char separator) {
@@ -232,10 +239,24 @@ public class plasmaParserDocument {
        return hyperlinks;
    }
    
-    public Map getMedialinks() {
-        // this is partly subset of getAnchor and getImage: all non-hyperrefs
+    public Map getAudiolinks() {
        if (!resorted) resortLinks();
-        return medialinks;
+        return this.audiolinks;
+    }
+    
+    public Map getVideolinks() {
+        if (!resorted) resortLinks();
+        return this.videolinks;
+    }
+    
+    public Map getImagelinks() {
+        if (!resorted) resortLinks();
+        return this.imagelinks;
+    }
+    
+    public Map getApplinks() {
+        if (!resorted) resortLinks();
+        return this.applinks;
    }
    
    public Map getEmaillinks() {
@@ -248,69 +269,70 @@ public class plasmaParserDocument {
        
        // extract hyperlinks, medialinks and emaillinks from anchorlinks
        Iterator i;
-        String url;
+        URL url;
+        String u;
        int extpos, qpos;
        String ext = null;
        i = anchors.entrySet().iterator();
        hyperlinks = new HashMap();
-        medialinks = new HashMap();
+        imagelinks = new HashMap();
+        videolinks = new HashMap();
+        audiolinks = new HashMap();
+        applinks   = new HashMap();
        emaillinks = new HashMap();
        TreeSet collectedImages = new TreeSet(); // this is a set that is collected now and joined later to the imagelinks
        Map.Entry entry;
        while (i.hasNext()) {
            entry = (Map.Entry) i.next();
-            url = (String) entry.getKey();
-            if ((url != null) && (url.startsWith("mailto:"))) {
-                emaillinks.put(url.substring(7), entry.getValue());
+            u = (String) entry.getKey();
+            if ((u != null) && (u.startsWith("mailto:"))) {
+                emaillinks.put(u.substring(7), entry.getValue());
            } else {
-                extpos = url.lastIndexOf(".");
-                String normal;
+                extpos = u.lastIndexOf(".");
                if (extpos > 0) {
-                    if (((qpos = url.indexOf("?")) >= 0) && (qpos > extpos)) {
-                        ext = url.substring(extpos, qpos).toLowerCase();
+                    if (((qpos = u.indexOf("?")) >= 0) && (qpos > extpos)) {
+                        ext = u.substring(extpos + 1, qpos).toLowerCase();
                    } else {
-                        ext = url.substring(extpos).toLowerCase();
+                        ext = u.substring(extpos + 1).toLowerCase();
                    }
-                    try {normal = new URL(url).toNormalform();} catch (MalformedURLException e1) {
-                        normal = null;
-                    }
-                    if (normal != null) { //TODO: extension function is not correct
-                        if (plasmaParser.mediaExtContains(ext.substring(1))) {
+                    try {
+                        url = new URL(u);
+                        u = url.toNormalform();
+                        if (plasmaParser.mediaExtContains(ext)) {
                            // this is not a normal anchor, its a media link
-                            medialinks.put(normal, entry.getValue());
+                            if (plasmaParser.imageExtContains(ext)) {
+                                imagelinks.put(u, entry.getValue());
+                                collectedImages.add(new htmlFilterImageEntry(url, "", -1, -1));
+                            }
+                            else if (plasmaParser.audioExtContains(ext)) audiolinks.put(u, entry.getValue());
+                            else if (plasmaParser.videoExtContains(ext)) videolinks.put(u, entry.getValue());
+                            else if (plasmaParser.appsExtContains(ext)) applinks.put(u, entry.getValue());
                        } else {
-                            hyperlinks.put(normal, entry.getValue());
-                        }
-                        if (plasmaParser.imageExtContains(ext.substring(1))) {
-                            try {
-                                collectedImages.add(new htmlFilterImageEntry(new URL(normal), "", -1, -1));
-                            } catch (MalformedURLException e) {}
+                            hyperlinks.put(u, entry.getValue());
                        }
+                    } catch (MalformedURLException e1) {
                    }
                }
            }
        }
        
-        // add the images to the medialinks
-        i = images.iterator();
-        String normal;
-        htmlFilterImageEntry iEntry;
-        while (i.hasNext()) {
-            iEntry = (htmlFilterImageEntry) i.next();
-            normal = iEntry.url().toNormalform();
-            if (normal != null) medialinks.put(normal, iEntry.alt()); // avoid NullPointerException
-        }
-        
        // expand the hyperlinks:
        // we add artificial hyperlinks to the hyperlink set
        // that can be calculated from given hyperlinks and imagelinks
        hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks));
-        hyperlinks.putAll(plasmaParser.allReflinks(medialinks));
+        hyperlinks.putAll(plasmaParser.allReflinks(imagelinks));
+        hyperlinks.putAll(plasmaParser.allReflinks(audiolinks));
+        hyperlinks.putAll(plasmaParser.allReflinks(videolinks));
+        hyperlinks.putAll(plasmaParser.allReflinks(applinks));
        hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks));
-        hyperlinks.putAll(plasmaParser.allSubpaths(medialinks));
+        hyperlinks.putAll(plasmaParser.allSubpaths(imagelinks));
+        hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks));
+        hyperlinks.putAll(plasmaParser.allSubpaths(videolinks));
+        hyperlinks.putAll(plasmaParser.allSubpaths(applinks));
        
        // finally add image links that we collected from the anchors to the image map
        i = collectedImages.iterator();
+        htmlFilterImageEntry iEntry;
        while (i.hasNext()) {
            iEntry = (htmlFilterImageEntry) i.next();
            if (!images.contains(iEntry)) images.add(iEntry);
--- a/source/de/anomic/plasma/plasmaSearchResult.java
+++ b/source/de/anomic/plasma/plasmaSearchResult.java
@ -167,13 +167,11 @@ public final class plasmaSearchResult {
        Iterator i = pageAcc.entrySet().iterator();
        HashMap paths = new HashMap(); // a url-subpath to pageAcc-key relation
        Map.Entry entry;
-        String path = null;
        
        // first scan all entries and find all urls that are referenced
        while (i.hasNext()) {
            entry = (Map.Entry) i.next();
-            path = urlPath(((indexURLEntry) entry.getValue()).comp().url());
-            paths.put(path, entry.getKey());
+            paths.put(((indexURLEntry) entry.getValue()).comp().url().toNormalform(), entry.getKey());
            //if (path != null) path = shortenPath(path);
            //if (path != null) paths.put(path, entry.getKey());
        }
@ -183,8 +181,7 @@ public final class plasmaSearchResult {
        String shorten;
        while (i.hasNext()) {
            entry = (Map.Entry) i.next();
-            path = urlPath(((indexURLEntry) entry.getValue()).comp().url());
-            shorten = shortenPath(path);
+            shorten = shortenPath(((indexURLEntry) entry.getValue()).comp().url().toNormalform());
            // scan all subpaths of the url
            while (shorten != null) {
                if (pageAcc.size() <= query.wantedResults) break;
@ -206,7 +203,7 @@ public final class plasmaSearchResult {
        if (pos < 0) return null;
        return path.substring(0, pos);
    }
-    
+    /*
    private static String urlPath(URL url) {
        String port = ((url.getPort() < 0) ? "" : ":" + url.getPort());
        String path = url.getPath();
@ -217,7 +214,7 @@ public final class plasmaSearchResult {
        }
        return url.getHost() + port + path;
    }
-    
+    */
    public Object[] getReferences(int count) {
        // create a list of words that had been computed by statistics over all
        // words that appeared in the url or the description of all urls
@ -260,7 +257,7 @@ public final class plasmaSearchResult {
            String hash, fill;
            String[] paths1 = new String[urls.length]; for (int i = 0; i < urls.length; i++) {
                fill = ""; for (int j = 0; j < 35 - urls[i].toString().length(); j++) fill +=" ";
-                paths1[i] = urlPath(urls[i]);
+                paths1[i] = urls[i].toNormalform();
                hash = plasmaURL.urlHash(urls[i]);
                System.out.println("paths1[" + urls[i] + fill +"] = " + hash + ", typeID=" + plasmaURL.flagTypeID(hash) + ", tldID=" + plasmaURL.flagTLDID(hash) + ", lengthID=" + plasmaURL.flagLengthID(hash) + " / " + paths1[i]);
            }
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@ -180,7 +180,7 @@ public class plasmaSnippetCache {
        return retrieveFromCache(hashes, plasmaURL.urlHash(url)) != null;
    }
    
-    public Snippet retrieveSnippet(URL url, Set queryhashes, boolean fetchOnline, int snippetMaxLength, int timeout) {
+    public Snippet retrieveSnippet(URL url, Set queryhashes, boolean fetchOnline, boolean pre, int snippetMaxLength, int timeout) {
        // heise = "0OQUNU3JSs05"
        if (queryhashes.size() == 0) {
            //System.out.println("found no queryhashes for URL retrieve " + url);
@ -257,7 +257,7 @@ public class plasmaSnippetCache {
        if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
                
        //System.out.println("loaded document for URL " + url);
-        final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+        final Enumeration sentences = document.getSentences(pre);
        document.close();
        //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
        if (sentences == null) {
@ -475,7 +475,7 @@ public class plasmaSnippetCache {
    private HashMap hashSentence(String sentence) {
        // generates a word-wordPos mapping
        HashMap map = new HashMap();
-        Enumeration words = plasmaCondenser.wordTokenizer(sentence, 0);
+        Enumeration words = plasmaCondenser.wordTokenizer(sentence, "UTF-8", 0);
        int pos = 0;
        String word;
        while (words.hasMoreElements()) {
@ -640,7 +640,7 @@ public class plasmaSnippetCache {
            urlstring = comp.url().toNormalform();
            if ((urlstring.matches(urlmask)) &&
                (!(existsInCache(comp.url(), queryhashes)))) {
-                new Fetcher(comp.url(), queryhashes, (int) maxTime).start();
+                new Fetcher(comp.url(), queryhashes, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), (int) maxTime).start();
                i++;
            }
        }
@ -650,15 +650,17 @@ public class plasmaSnippetCache {
        URL url;
        Set queryhashes;
        int timeout;
-        public Fetcher(URL url, Set queryhashes, int timeout) {
+        boolean pre;
+        public Fetcher(URL url, Set queryhashes, boolean pre, int timeout) {
            if (url.getHost().endsWith(".yacyh")) return;
            this.url = url;
            this.queryhashes = queryhashes;
            this.timeout = timeout;
+            this.pre = pre;
        }
        public void run() {
            log.logFine("snippetFetcher: try to get URL " + url);
-            plasmaSnippetCache.Snippet snippet = retrieveSnippet(url, queryhashes, true, 260, timeout);
+            plasmaSnippetCache.Snippet snippet = retrieveSnippet(url, queryhashes, true, pre, 260, timeout);
            if (snippet.line == null)
                log.logFine("snippetFetcher: cannot get URL " + url + ". error(" + snippet.source + "): " + snippet.error);
            else
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -106,6 +106,7 @@ package de.anomic.plasma;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
 import java.lang.reflect.Constructor;
 import java.net.InetAddress;
 import java.net.MalformedURLException;
@ -1564,10 +1565,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                
                checkInterruption();
                log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
-                plasmaCondenser condenser = new plasmaCondenser(document.getText());
+                plasmaCondenser condenser = new plasmaCondenser(document.getText(), document.charset);
                
                // generate citation reference
-                Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser);
+                Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
                
                try {        
                    // check for interruption
@ -1575,22 +1576,27 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                    
                    // create a new loaded URL db entry
                    indexURLEntry newEntry = urlPool.loadedURL.newEntry(
-                            entry.url(),                                         // URL
-                            docDescription,                                      // document description
-                            "",                                                  // author
-                            "",                                                  // tags
-                            "",                                                  // ETag
-                            docDate,                                             // modification date
-                            new Date(),                                          // loaded date
-                            new Date(),                                          // freshdate 
-                            referrerUrlHash,                                     // referer hash
-                            new byte[0],                                         // md5
-                            (int) entry.size(),                                  // size
-                            condenser.RESULT_NUMB_WORDS,                         // word count
-                            plasmaURL.docType(document.getMimeType()),           // doctype
-                            condenser.RESULT_FLAGS,                              // flags
-                            plasmaURL.language(entry.url()),                     // language
-                            0,0,0,0,0,0
+                            entry.url(),                               // URL
+                            docDescription,                            // document description
+                            "",                                        // author
+                            "",                                        // tags
+                            "",                                        // ETag
+                            docDate,                                   // modification date
+                            new Date(),                                // loaded date
+                            new Date(),                                // freshdate 
+                            referrerUrlHash,                           // referer hash
+                            new byte[0],                               // md5
+                            (int) entry.size(),                        // size
+                            condenser.RESULT_NUMB_WORDS,               // word count
+                            plasmaURL.docType(document.getMimeType()), // doctype
+                            condenser.RESULT_FLAGS,                    // flags
+                            plasmaURL.language(entry.url()),           // language
+                            ioLinks[0].intValue(),                     // llocal
+                            ioLinks[1].intValue(),                     // lother
+                            document.audiolinks.size(),                // laudio
+                            document.imagelinks.size(),                // limage
+                            document.videolinks.size(),                // lvideo
+                            document.applinks.size()                   // lapp
                    );
                    /* ========================================================================
                     * STORE URL TO LOADED-URL-DB
@ -1598,7 +1604,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                    urlPool.loadedURL.store(newEntry);
                    urlPool.loadedURL.stack(
                            newEntry,                       // loaded url db entry
-                            initiatorPeerHash,                  // initiator peer hash
+                            initiatorPeerHash,              // initiator peer hash
                            yacyCore.seedDB.mySeed.hash,    // executor peer hash
                            processCase                     // process case
                    );                    
@ -2094,7 +2100,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                        filename = comp.url().getFile();
                        if ((seed == null) || ((address = seed.getAddress()) == null)) {
                            // seed is not known from here
-                            removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes()));
+                            removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + comp.descr()).getBytes(), "UTF-8"));
                            urlPool.loadedURL.remove(urlentry.hash()); // clean up
                            continue; // next result
                        }
@ -2121,7 +2127,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                    URL wordURL;
                    if (urlstring.matches(query.urlMask)) { //.* is default
                        if (includeSnippets) {
-                            snippet = snippetCache.retrieveSnippet(comp.url(), query.queryHashes, false, 260, 1000);
+                            snippet = snippetCache.retrieveSnippet(comp.url(), query.queryHashes, false, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), 260, 1000);
                        } else {
                            snippet = null;
                        }
@ -2237,10 +2243,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
            InputStream docBodyInputStream = document.getText();
            
            // getting word iterator
-            Iterator witer = plasmaCondenser.getWords(docBodyInputStream);
+            Iterator witer = null;
+            try {
+                witer = plasmaCondenser.getWords(docBodyInputStream, document.charset);
+            } catch (UnsupportedEncodingException e) {
+                e.printStackTrace();
+            }
            
            // delete all word references
-            int count = removeReferences(urlhash, witer);
+            int count = 0;
+            if (witer != null) count = removeReferences(urlhash, witer);
            
            // finally delete the url entry itself
            urlPool.loadedURL.remove(urlhash);
--- a/source/de/anomic/plasma/plasmaURL.java
+++ b/source/de/anomic/plasma/plasmaURL.java
@ -420,8 +420,7 @@ public class plasmaURL {
            tld = host.substring(p + 1);
            dom = host.substring(0, p);
        }
-        Integer ID = (serverCore.isNotLocal(tld)) ? (Integer) TLDID.get(tld)
-                : null; // identify local addresses
+        Integer ID = (serverCore.isNotLocal(tld)) ? (Integer) TLDID.get(tld) : null; // identify local addresses
        int id = (ID == null) ? 7 : ID.intValue(); // local addresses are flagged with id=7
        boolean isHTTP = url.getProtocol().equals("http");
        p = dom.lastIndexOf('.'); // locate subdomain
--- a/source/de/anomic/server/serverCore.java
+++ b/source/de/anomic/server/serverCore.java
@ -414,19 +414,21 @@ public final class serverCore extends serverAbstractThread implements serverThre
    public static boolean isNotLocal(String ip) {
        // generate ip address if ip is given by host
        assert (ip != null);
+        
+        // check local ip addresses
+        if ((ip.equals("localhost")) ||
+            (ip.startsWith("127")) ||
+            (ip.startsWith("192.168")) ||
+            (ip.startsWith("10."))
+           ) return false;
+        
+        // make a dns resolve
        final InetAddress clientAddress = httpc.dnsResolve(ip);
        if (clientAddress != null) {
            if ((clientAddress.isAnyLocalAddress()) || (clientAddress.isLoopbackAddress())) return false;
            if (ip.charAt(0) > '9') ip = clientAddress.getHostAddress();
        }
        
-        // check local ip addresses
-        if ((ip.equals("localhost")) ||
-            (ip.startsWith("127")) ||
-	        (ip.startsWith("192.168")) ||
-	        (ip.startsWith("10."))
-	       ) return false;
-        
        // finally check if there are other local IP addresses that are not in the standard IP range
        for (int i = 0; i < localAddresses.length; i++) {
            if (localAddresses[i].equals(clientAddress)) return false;
--- a/yacy.init
+++ b/yacy.init
@ -188,7 +188,7 @@ parseableMimeTypes.URLREDIRECTOR=
 # a comma-separated list of extensions that denote media file formats
 # this is important to recognize <a href> tags as non-HTML references
 # These files will be excluded from indexing _(Please keep extensions in alphabetical order)_
-mediaExt=7z,ace,arj,asf,asx,avi,bin,bz2,css,db,dcm,deb,doc,dll,dmg,gif,gz,hqx,ico,img,iso,jar,jpe,jpg,jpeg,lx,lxl,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,scr,sit,so,swf,sxc,sxd,sxi,sxw,tar,tbz,tgz,torrent,war,wmv,xcf,xls,zip
+mediaExt=7z,ace,aif,aiff,arj,asf,asx,avi,bin,bz2,css,db,dcm,deb,doc,dll,dmg,gif,gz,hqx,ico,img,iso,jar,jpe,jpg,jpeg,lx,lxl,m4v,mpeg,mov,mp3,mpg,ogg,png,pdf,ppt,ps,ram,rar,rm,rpm,scr,sit,so,swf,sxc,sxd,sxi,sxw,tar,tbz,tgz,torrent,war,wav,wmv,xcf,xls,zip
 parseableExt=html,htm,txt,php,shtml,asp,aspx,jsp

 # Promotion Strings