*) adding a new package for extra content parsers

*) adding content parsers for - pdf (using the pdf-box library) - doc (using the textmining.org library) *) adding an interface for content parsers *) adding a configuration file which can be used to configure which parser is used for which mimeType *) Semaphore class was moved and renamed to serverSemaphore *) Changing yacy shutdown behaviour: the busy-waiting loop for shutdown was removed and replaced with a blocking call (using the semaphore class mentioned above) to the new switchboard.waitForShutdown method. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@46 6c8d7289-2bf4-0310-a012-ef5d649a1542
20 years ago · 58b1a0ba40
parent 17d993cfee
commit 58b1a0ba40
14 changed files with 909 additions and 250 deletions
--- a/build.xml
+++ b/build.xml
@ -5,6 +5,7 @@
 <property name="src" location="source"/>
 <property name="lib" location="lib"/>
 <property name="libx" location="libx"/>
 <property name="build" location="classes"/>
 <property name="htroot" location="htroot"/>
@ -39,6 +40,13 @@
        	<!-- libs needed for the yacy thread/object-pools -->
          	<pathelement location="${lib}/commons-collections.jar" />
        	<pathelement location="${lib}/commons-pool-1.2.jar" />        	
        	<!-- libs needed to parse pdf files -->
          	<pathelement location="${libx}/PDFBox-0.7.1.jar" />        	
          	<pathelement location="${libx}/log4j-1.2.9.jar" />    
        	<!-- libs needed for parsing doc files -->
          	<pathelement location="${libx}/tm-extractors-0.4.jar" />         		
        </classpath>		
 	</javac>	
 	<javac srcdir="${htroot}/" destdir="${htroot}" classpath="${build}"/>
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@ -114,7 +114,7 @@ public class CacheAdmin_p {
                    else {
                        htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
                        OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
-                        plasmaParser.document document = switchboard.parser.transformScraper(url, "text/html", scraper);
+                        plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
                        serverFileUtils.copy(file, os);
                        info += "<b>HEADLINE:</b><br>" + scraper.getHeadline() + "<br><br>";
                        info += "<b>HREF:</b><br>" + formatAnchor(document.getHyperlinks()) + "<br>";
--- a/htroot/Steering.java
+++ b/htroot/Steering.java
@ -73,7 +73,7 @@ public class Steering {
        }
 	if (post.containsKey("shutdown")) {
-            switchboard.terminate = true;
+            switchboard.terminate();
            prop.put("info", 3);//shutting down
            return prop;
        }
--- a/source/de/anomic/plasma/parser/Parser.java
+++ b/source/de/anomic/plasma/parser/Parser.java
@ -0,0 +1,59 @@
 /*
 * Check4Update is a stand-alone server application that can be used to 
 * monitor various types of online resources for updates and changes and
 * notifies the user if a modification was detected.
 * 
 * Copyright (C) 2005 Martin Thelian
 * 
 * This program is free software; you can redistribute it and/or modify 
 * it under the terms of the GNU General Public License as published by 
 * the Free Software Foundation; either version 2 of the License, or (at 
 * your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty of 
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License 
 * along with this program; if not, write to the Free Software Foundation, 
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 * 
 * For more information, please email thelian@users.sourceforge.net
 * 
 */ 
 /* =======================================================================
 * Revision Control Information
 * $Source: $
 * $Author: $
 * $Date: $
 * $Revision: $
 * ======================================================================= */
 package de.anomic.plasma.parser;
 import java.io.File;
 import java.io.InputStream;
 import java.net.URL;
 import java.util.HashSet;
 import de.anomic.plasma.plasmaParserDocument;
 /**
  * Contract implemented by content parsers that convert a resource of a
  * given mime type into a {@link plasmaParserDocument}.
  *
  * The resource can be handed over as a byte array, as a file or as a
  * stream; every variant reports failures through {@link ParserException}.
  */
 public interface Parser {

    /**
     * Parses a resource that is available as an in-memory byte array.
     *
     * @param location the URL the resource belongs to
     * @param mimeType the mime type of the resource
     * @param source   the raw resource content
     * @return the parsed document
     * @throws ParserException if the content could not be parsed
     */
    plasmaParserDocument parse(URL location, String mimeType, byte[] source) throws ParserException;

    /**
     * Parses a resource that is stored in a file.
     *
     * @param location   the URL the resource belongs to
     * @param mimeType   the mime type of the resource
     * @param sourceFile the file holding the resource content
     * @return the parsed document
     * @throws ParserException if the content could not be parsed
     */
    plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException;

    /**
     * Parses a resource that is read from a stream.
     *
     * @param location the URL the resource belongs to
     * @param mimeType the mime type of the resource
     * @param source   the stream delivering the resource content
     * @return the parsed document
     * @throws ParserException if the content could not be parsed
     */
    plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException;

    /**
     * @return the set of mime types this parser declares as supported
     */
    HashSet getSupportedMimeTypes();

    /**
     * Resets the parser.
     * NOTE(review): presumably meant to clear per-document state before the
     * instance is reused - confirm against the implementations.
     */
    void reset();
 }
--- a/source/de/anomic/plasma/parser/ParserException.java
+++ b/source/de/anomic/plasma/parser/ParserException.java
@ -0,0 +1,21 @@
 package de.anomic.plasma.parser;
 public class ParserException extends Exception
 {
    public ParserException() {
        super();
    }
    public ParserException(String message) {
        super(message);
    }
    public ParserException(String message, Throwable cause) {
        super(message, cause);
    }
    public ParserException(Throwable cause) {
        super(cause);
    }
 }
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@ -0,0 +1,127 @@
 /*
 * Check4Update is a stand-alone server application that can be used to 
 * monitor various types of online resources for updates and changes and
 * notifies the user if a modification was detected.
 * 
 * Copyright (C) 2005 Martin Thelian
 * 
 * This program is free software; you can redistribute it and/or modify 
 * it under the terms of the GNU General Public License as published by 
 * the Free Software Foundation; either version 2 of the License, or (at 
 * your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty of 
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License 
 * along with this program; if not, write to the Free Software Foundation, 
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 * 
 * For more information, please email thelian@users.sourceforge.net
 * 
 */ 
 /* =======================================================================
 * Revision Control Information
 * $Source: $
 * $Author: $
 * $Date: $
 * $Revision: $
 * ======================================================================= */
 package de.anomic.plasma.parser.doc;
 import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.InputStream;
 import java.net.URL;
 import java.util.Arrays;
 import java.util.HashSet;
 import org.textmining.text.extraction.WordExtractor;
 import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.plasma.parser.Parser;
 import de.anomic.plasma.parser.ParserException;
 /**
  * {@link Parser} implementation for Microsoft Word documents. The text is
  * extracted with the textmining.org {@link WordExtractor}; only the
  * location, the mime type and the extracted text are filled into the
  * resulting {@link plasmaParserDocument}, all other fields are passed as
  * <code>null</code>.
  */
 public class docParser implements Parser {

    /**
     * a list of mime types that are supported by this parser class
     */
    public static final HashSet<String> SUPPORTED_MIME_TYPES = new HashSet<String>(Arrays.asList(new String[] {
        "application/msword"
    }));

    public docParser() {
        super();
    }

    /**
     * Parses a word document that is available as byte array.
     *
     * @param location the URL of the document
     * @param mimeType the mime type of the document
     * @param source   the raw document content
     * @return the parsed document
     * @throws ParserException if the content could not be parsed
     */
    public plasmaParserDocument parse(URL location, String mimeType,
            byte[] source) throws ParserException {
        return this.parse(location, mimeType, new ByteArrayInputStream(source));
    }

    /**
     * Parses a word document that is stored in a file. The file stream is
     * opened and closed by this method.
     *
     * @param location   the URL of the document
     * @param mimeType   the mime type of the document
     * @param sourceFile the file containing the document
     * @return the parsed document
     * @throws ParserException if the file could not be opened or the
     *         content could not be parsed
     */
    public plasmaParserDocument parse(URL location, String mimeType,
            File sourceFile) throws ParserException {
        InputStream contentInputStream = null;
        try {
            contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
            return this.parse(location, mimeType, contentInputStream);
        } catch (FileNotFoundException e) {
            // report the missing file directly instead of handing a null stream to the extractor
            throw new ParserException("Unable to open the doc file " + sourceFile + ". " + e.getMessage(), e);
        } finally {
            if (contentInputStream != null) {
                try {
                    contentInputStream.close();
                } catch (Exception ignored) {
                    // best effort: the content was already consumed (or parsing failed anyway)
                }
            }
        }
    }

    /**
     * Parses a word document that is read from a stream. The stream is not
     * closed by this method.
     *
     * @param location the URL of the document
     * @param mimeType the mime type of the document
     * @param source   the stream delivering the document content
     * @return the parsed document
     * @throws ParserException if the text extraction fails for any reason;
     *         the original failure is attached as cause
     */
    public plasmaParserDocument parse(URL location, String mimeType,
            InputStream source) throws ParserException {
        try {
            WordExtractor extractor = new WordExtractor();
            String contents = extractor.extractText(source);
            // NOTE(review): getBytes() encodes with the platform default charset
            return new plasmaParserDocument(
                    location,
                    mimeType,
                    null,
                    null,
                    null,
                    null,
                    null,
                    contents.getBytes(),
                    null,
                    null);
        } catch (Exception e) {
            // keep the cause so that the real failure is visible in stack traces
            throw new ParserException("Unable to parse the doc content. " + e.getMessage(), e);
        }
    }

    /**
     * @return the shared {@link #SUPPORTED_MIME_TYPES} set (not a copy)
     */
    public HashSet getSupportedMimeTypes() {
        return docParser.SUPPORTED_MIME_TYPES;
    }

    /**
     * Does nothing; this parser keeps no state between calls.
     */
    public void reset() {
        // nothing to reset
    }

    /**
     * Empty entry point, kept for compatibility.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // intentionally empty
    }
 }
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@ -0,0 +1,119 @@
 package de.anomic.plasma.parser.pdf;
 import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.InputStream;
 import java.io.OutputStreamWriter;
 import java.net.URL;
 import java.util.Arrays;
 import java.util.HashSet;
 import org.pdfbox.pdfparser.PDFParser;
 import org.pdfbox.pdmodel.PDDocument;
 import org.pdfbox.pdmodel.PDDocumentInformation;
 import org.pdfbox.util.PDFTextStripper;
 import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.plasma.parser.Parser;
 import de.anomic.plasma.parser.ParserException;
 /**
  * {@link Parser} implementation for pdf documents based on the PDFBox
  * library. Besides the plain text, the title, subject and keywords from
  * the pdf document information are handed to the resulting
  * {@link plasmaParserDocument}.
  */
 public class pdfParser implements Parser
 {
    /**
     * a list of mime types that are supported by this parser class
     */
    public static final HashSet<String> SUPPORTED_MIME_TYPES = new HashSet<String>(Arrays.asList(new String[] {
        "application/pdf"
    }));

    public pdfParser() {
        super();
    }

    /**
     * @return the shared {@link #SUPPORTED_MIME_TYPES} set (not a copy)
     */
    public HashSet getSupportedMimeTypes() {
        return SUPPORTED_MIME_TYPES;
    }

    /**
     * Parses a pdf document that is stored in a file. The file stream is
     * opened and closed by this method.
     *
     * @param location   the URL of the document
     * @param mimeType   the mime type of the document
     * @param sourceFile the file containing the document
     * @return the parsed document
     * @throws ParserException if the file could not be opened or the
     *         content could not be parsed
     */
    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException {
        InputStream contentInputStream = null;
        try {
            contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
            return this.parse(location, mimeType, contentInputStream);
        } catch (FileNotFoundException e) {
            // report the missing file directly instead of handing a null stream to the pdf parser
            throw new ParserException("Unable to open the pdf file " + sourceFile + ". " + e.getMessage(), e);
        } finally {
            if (contentInputStream != null) {
                try {
                    contentInputStream.close();
                } catch (Exception ignored) {
                    // best effort: the content was already consumed (or parsing failed anyway)
                }
            }
        }
    }

    /**
     * Parses a pdf document that is available as byte array.
     *
     * @param location the URL of the document
     * @param mimeType the mime type of the document
     * @param source   the raw document content
     * @return the parsed document
     * @throws ParserException if the content could not be parsed
     */
    public plasmaParserDocument parse(URL location, String mimeType, byte[] source) throws ParserException {
        return this.parse(location, mimeType, new ByteArrayInputStream(source));
    }

    /**
     * Parses a pdf document that is read from a stream. The stream is not
     * closed by this method; the PDFBox document created from it is always
     * closed, also when the extraction fails.
     *
     * @param location the URL of the document
     * @param mimeType the mime type of the document
     * @param source   the stream delivering the document content
     * @return the parsed document
     * @throws ParserException if parsing or text extraction fails for any
     *         reason; the original failure is attached as cause
     */
    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
        PDDocument theDocument = null;
        try {
            String docTitle = null, docSubject = null, docKeyWords = null;

            PDFParser parser = new PDFParser(source);
            parser.parse();
            theDocument = parser.getPDDocument();

            // the document information is optional
            PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
            if (theDocInfo != null) {
                docTitle = theDocInfo.getTitle();
                docSubject = theDocInfo.getSubject();
                docKeyWords = theDocInfo.getKeywords();
            }

            // extracting the plain text
            // NOTE(review): the writer encodes with the platform default charset
            PDFTextStripper stripper = new PDFTextStripper();
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            OutputStreamWriter writer = new OutputStreamWriter(out);
            stripper.writeText(theDocument, writer);
            writer.close();
            byte[] contents = out.toByteArray();

            /*
             * argument order of the document constructor:
             *   location, mimeType, keywords, shortTitle, longTitle,
             *   sections, abstrct, text, anchors, images
             * i.e. the pdf subject is used as short title and the pdf title
             * as long title.
             */
            return new plasmaParserDocument(
                    location,
                    mimeType,
                    docKeyWords,
                    docSubject,
                    docTitle,
                    null,
                    null,
                    contents,
                    null,
                    null);
        } catch (Exception e) {
            // keep the cause so that the real failure is visible in stack traces
            throw new ParserException("Unable to parse the pdf content. " + e.getMessage(), e);
        } finally {
            if (theDocument != null) {
                try {
                    theDocument.close();
                } catch (Exception ignored) {
                    // best effort: nothing sensible can be done if closing fails
                }
            }
        }
    }

    /**
     * Does nothing; this parser keeps no state between calls.
     */
    public void reset() {
        // nothing to reset
    }
 }
--- a/source/de/anomic/plasma/plasmaCrawlLoader.java
+++ b/source/de/anomic/plasma/plasmaCrawlLoader.java
@ -192,75 +192,14 @@ public final class plasmaCrawlLoader extends Thread {
 }
 /**
  * Counting semaphore with the classic P (acquire) and V (release)
  * operations and an optional upper bound for the counter.
  *
  * The counter holds the number of permits that are currently available
  * and never becomes negative: {@link #P()} blocks while it is zero.
  */
 final class Semaphore {

    /** number of permits that are currently available */
    private long currentValue = 0;
    /** largest value the counter may reach */
    private long maximumValue = Long.MAX_VALUE;

    /** Creates a semaphore with no permits and no practical upper bound. */
    protected Semaphore() {
        this(0, Long.MAX_VALUE);
    }

    /**
     * Creates a semaphore with no practical upper bound.
     *
     * @param initialValue number of permits available at the beginning
     */
    public Semaphore(long initialValue) {
        this(initialValue, Long.MAX_VALUE);
    }

    /**
     * Creates a bounded semaphore.
     *
     * @param initialValue number of permits available at the beginning, must be &gt;= 0
     * @param maxValue     upper bound of the counter, must be &gt;= 1 and &gt;= initialValue
     * @throws IllegalArgumentException if one of the constraints is violated
     */
    protected Semaphore(long initialValue, long maxValue) {
        /* some errorhandling */
        if (maxValue < initialValue) {
            throw new IllegalArgumentException("The semaphore maximum value must not be " +
                                               "less than the semaphore init value.");
        }
        if (maxValue < 1) {
            throw new IllegalArgumentException("The semaphore maximum value must be greater or equal 1.");
        }
        if (initialValue < 0) {
            throw new IllegalArgumentException("The semaphore initial value must be greater or equal 0.");
        }

        // setting the initial semaphore values
        this.currentValue = initialValue;
        this.maximumValue = maxValue;
    }

    /**
     * Acquires one permit, blocking until a permit is available.
     *
     * @throws InterruptedException if the calling thread is interrupted
     *         while waiting; no permit is consumed in this case
     */
    public synchronized void P() throws InterruptedException {
        // re-check the condition after every wakeup: wait() may return spuriously
        while (this.currentValue <= 0) {
            wait();
        }
        this.currentValue--;
    }

    /**
     * Releases one permit and wakes up one waiting thread, if there is any.
     *
     * @throws IndexOutOfBoundsException if the counter already is at its
     *         maximum value
     */
    public synchronized void V() {
        // the maximum itself is a legal counter value, only exceeding it is an error
        if (this.currentValue >= this.maximumValue) {
            throw new IndexOutOfBoundsException("The maximum value of the semaphore was reached");
        }
        this.currentValue++;
        notify();
    }
 }
 class CrawlerMessageQueue {
-    private final Semaphore readSync;
+    private final serverSemaphore readSync;
-    private final Semaphore writeSync;
+    private final serverSemaphore writeSync;
    private final ArrayList messageList;
    public CrawlerMessageQueue()  {
-        this.readSync  = new Semaphore (0);
+        this.readSync  = new serverSemaphore (0);
-        this.writeSync = new Semaphore (1);
+        this.writeSync = new serverSemaphore (1);
        this.messageList = new ArrayList(10);        
    }
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@ -45,52 +45,155 @@ package de.anomic.plasma;
 import java.io.*;
 import java.net.*;
 import java.util.*;
-import de.anomic.server.*;
+
 import org.apache.commons.pool.KeyedPoolableObjectFactory;
 import org.apache.commons.pool.impl.GenericKeyedObjectPool;
 import org.apache.commons.pool.impl.GenericObjectPool;
 import de.anomic.plasma.parser.Parser;
 import de.anomic.server.serverFileUtils;
 import de.anomic.htmlFilter.*;
-public class plasmaParser {
+public final class plasmaParser {
    public static String mediaExt =
        "swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
-        "sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj";
+        "sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj";
    private final Properties parserList;
 	private final plasmaParserPool theParserPool;
    /**
     * Creates the parser dispatcher.
     *
     * The given property file is loaded into the parser list, which maps a
     * mime type to the class name of the parser to use for it. A keyed
     * object pool for parser instances is set up, and every configured
     * parser class is loaded once so that entries whose class cannot be
     * loaded are dropped right away instead of failing at parse time.
     *
     * @param parserDispatcherPropertyFile property file with the
     *        mimeType-to-parser-class configuration; if it cannot be read
     *        an error is printed and the parser list stays empty
     */
    public plasmaParser(File parserDispatcherPropertyFile) {
        // loading the list of available parsers from file
        Properties prop = new Properties();
        FileInputStream propStream = null;
        try {
            propStream = new FileInputStream(parserDispatcherPropertyFile);
            prop.load(propStream);
        } catch (IOException e) {
            System.err.println("ERROR: " + parserDispatcherPropertyFile.toString() + " not found in settings path");
        } finally {
            if (propStream != null) {
                try {
                    propStream.close();
                } catch (IOException ignored) {
                    // best effort: the properties were already read (or reading failed anyway)
                }
            }
        }
        this.parserList = prop;

        /*
         * initializing the parser object pool
         */
        GenericKeyedObjectPool.Config config = new GenericKeyedObjectPool.Config();

        // The maximum number of active objects that can be allocated from pool at the same time,
        // 0 for no limit (NOTE(review): confirm against the commons-pool version in use)
        config.maxActive = 0;

        // The maximum number of idle objects in the pool
        config.maxIdle = 10;

        config.whenExhaustedAction = GenericObjectPool.WHEN_EXHAUSTED_BLOCK;
        config.minEvictableIdleTimeMillis = 30000;

        this.theParserPool = new plasmaParserPool(new plasmaParserFactory(), config);

        /* testing if all parsers could be loaded properly.
         * This is done now to avoid surprises at runtime. */
        Iterator parserIterator = this.parserList.entrySet().iterator();
        while (parserIterator.hasNext()) {
            Map.Entry parserEntry = (Map.Entry) parserIterator.next();
            String className = (String) parserEntry.getValue();
            try {
                Class.forName(className);
            } catch (Exception e) {
                // if we could not load the parser we remove it from the parser list.
                // The list is keyed by mime type, so the entry has to be removed
                // through the iterator and not by looking up the class name.
                System.err.println("ERROR: Unable to load parser class " + className +
                                   " configured for type " + parserEntry.getKey());
                parserIterator.remove();
            }
        }
    }
    public void close() {
-        // frees resources; does nothing yet
+        // release resources 
-    }
+        try {        
 	        // clearing the parser list
 	        this.parserList.clear();
 	        // closing the parser object pool
 	        this.theParserPool.close();
        } catch (Exception e) {
            //
        }
    }    
-    public document parseSource(URL location, String mimeType, byte[] source) {
+    public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) {
-        // make a scraper and transformer
+        
-        htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
+        Parser theParser = null;
        OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
        try {
            if ((mimeType != null) && (mimeType.indexOf(";") != -1)) {
                mimeType = mimeType.substring(0,mimeType.indexOf(";"));
            }                        
            // getting the correct parser for the given mimeType
            theParser = this.getParser(mimeType);
            // if a parser was found we use it ...
            if (theParser != null) {
                return theParser.parse(location, mimeType,source);
            }
            // ...otherwise we make a html scraper and transformer
            htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
            OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
            hfos.write(source);
            return transformScraper(location, mimeType, scraper);
-        } catch (IOException e) {
+        } catch (Exception e) {
            return null;
        } finally {
            if (theParser != null) {
                try {
                    this.theParserPool.returnObject(mimeType, theParser);
                } catch (Exception e) {
                }
            }
        }
    }
-    public document parseSource(URL location, String mimeType, File sourceFile) {
+    public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) {
-        // make a scraper and transformer
+
-        htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
+        Parser theParser = null;
        OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
        try {
-	    serverFileUtils.copy(sourceFile, hfos);
+            if ((mimeType != null) && (mimeType.indexOf(";") != -1)) {
                mimeType = mimeType.substring(0,mimeType.indexOf(";"));
            }            
            // getting the correct parser for the given mimeType
            theParser = this.getParser(mimeType);
            // if a parser was found we use it ...
            if (theParser != null) {
                return theParser.parse(location, mimeType,sourceFile);
            }    
            // ...otherwise we make a scraper and transformer
            htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
            OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);            
 			serverFileUtils.copy(sourceFile, hfos);
            return transformScraper(location, mimeType, scraper);
-        } catch (IOException e) {
+        } catch (Exception e) {
            return null;
        } finally {
            if (theParser != null) {
                try {
                    this.theParserPool.returnObject(mimeType, theParser);
                } catch (Exception e) {
                }
            }
        }
    }
-    public document transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
+    public plasmaParserDocument transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
        try {
-            return new document(new URL(urlNormalform(location)),
+            return new plasmaParserDocument(new URL(urlNormalform(location)),
                                mimeType, null, null, scraper.getHeadline(),
                                null, null,
                                scraper.getText(), scraper.getAnchors(), scraper.getImages());
@ -99,6 +202,41 @@ public class plasmaParser {
        }
    }
    /**
     * Determines the parser that should be used for a given mime type and
     * borrows an instance of it from the parser pool.
     *
     * The parser class name is looked up in the parser list. The borrowed
     * instance is only handed out if the parser itself reports the mime
     * type as supported; otherwise it is given back to the pool at once.
     *
     * @param mimeType the mime type to find a parser for, may be <code>null</code>
     * @return the borrowed parser, or <code>null</code> if the mime type is
     *         <code>null</code>, no parser is configured for it, the parser
     *         does not support it, or the lookup failed
     */
    public Parser getParser(String mimeType) {
        // TODO: do automatic mimetype detection
        if (mimeType == null) return null;

        try {
            final String parserClassName = (String) this.parserList.get(mimeType);
            if (parserClassName == null) return null;

            // fetching a parser object from pool
            final Parser candidate = (Parser) this.theParserPool.borrowObject(parserClassName);

            // checking if the parser really supports the given mimetype
            final HashSet declaredTypes = candidate.getSupportedMimeTypes();
            final boolean supported = (declaredTypes != null) && declaredTypes.contains(mimeType);
            if (!supported) {
                this.theParserPool.returnObject(parserClassName, candidate);
                return null;
            }
            return candidate;
        } catch (Exception e) {
            System.err.println("ERROR: Unable to load the correct parser for type " + mimeType);
            return null;
        }
    }
    public static String urlNormalform(URL url) {
        if (url == null) return null;
        return urlNormalform(url.toString());
@ -114,160 +252,7 @@ public class plasmaParser {
        return us;
    }   
-    public class document {
+    static Map allReflinks(Map links) {
        URL location;       // the source url
        String mimeType;    // mimeType as taken from http header
        String keywords;    // most resources provide a keyword field
        String shortTitle;  // a shortTitle mostly appears in the window header (border)
        String longTitle;   // the real title of the document, commonly h1-tags
        String[] sections;  // if present: more titles/headlines appearing in the document
        String abstrct;     // an abstract, if present: short content description
        byte[] text;        // the clear text, all that is visible
        Map anchors;        // all links embedded as clickeable entities (anchor tags)
        Map images;         // all visible pictures in document
        // the anchors and images - Maps are URL-to-EntityDescription mappings.
        // The EntityDescription appear either as visible text in anchors or as alternative
        // text in image tags.
        Map hyperlinks;
        Map medialinks;
        Map emaillinks;
        /**
         * Creates a document from already extracted values. All arguments
         * are stored as given; the hyperlink, medialink and emaillink views
         * start out unset and are computed on first access.
         *
         * @param location   the source url
         * @param mimeType   mime type of the resource
         * @param keywords   keywords of the resource
         * @param shortTitle short title
         * @param longTitle  long title
         * @param sections   further titles/headlines
         * @param abstrct    short content description
         * @param text       the clear text
         * @param anchors    links embedded as anchors
         * @param images     links embedded as images
         */
        public document(URL location, String mimeType,
                        String keywords, String shortTitle, String longTitle,
                        String[] sections, String abstrct,
                        byte[] text, Map anchors, Map images) {
            // origin of the resource
            this.location = location;
            this.mimeType = mimeType;

            // descriptive metadata
            this.keywords = keywords;
            this.shortTitle = shortTitle;
            this.longTitle = longTitle;
            this.sections = sections;
            this.abstrct = abstrct;

            // content and embedded references
            this.text = text;
            this.anchors = anchors;
            this.images = images;

            // derived link views, filled by resortLinks()
            this.hyperlinks = null;
            this.medialinks = null;
            this.emaillinks = null;
        }
        /**
         * Resolves a path relative to the document location and returns it
         * in url normal form.
         *
         * @param relativePath the path to resolve
         * @return the normalized absolute url, or an empty string if it
         *         could not be built
         */
        private String absolutePath(String relativePath) {
            String resolved;
            try {
                final URL absolute = new URL(location, relativePath);
                resolved = urlNormalform(absolute);
            } catch (Exception e) {
                resolved = "";
            }
            return resolved;
        }
        /**
         * @return the short title, or the long title if no short title is set
         */
        public String getMainShortTitle() {
            return (shortTitle == null) ? longTitle : shortTitle;
        }
        /**
         * @return the long title, or the short title if no long title is set
         */
        public String getMainLongTitle() {
            return (longTitle == null) ? shortTitle : longTitle;
        }
        /**
         * @return the section titles; if none are set, a one-element array
         *         holding the main long title
         */
        public String[] getSectionTitles() {
            if (sections == null) {
                return new String[]{getMainLongTitle()};
            }
            return sections;
        }
        /**
         * @return the abstract, or the main long title if no abstract is set
         */
        public String getAbstract() {
            if (abstrct == null) {
                return getMainLongTitle();
            }
            return abstrct;
        }
        /**
         * @return only the clear (visible) text, not the source data
         */
        public byte[] getText() {
            return this.text;
        }
        /**
         * @return all links embedded as anchors (clickable entities)
         */
        public Map getAnchors() {
            return this.anchors;
        }
        /**
         * @return all links embedded as pictures (visible in the document)
         */
        public Map getImages() {
            return this.images;
        }
        // the next three methods provide a calculated view on the getAnchors/getImages:
        /**
         * Returns the hyperlink view, computing all link views first if
         * this one is not available yet.
         *
         * @return a subset of the anchors: only links to other hyperrefs
         */
        public Map getHyperlinks() {
            if (this.hyperlinks == null) {
                resortLinks();
            }
            return this.hyperlinks;
        }
        /**
         * Returns the media link view, computing all link views first if
         * this one is not available yet.
         *
         * @return the media links taken from the anchors plus the image links
         */
        public Map getMedialinks() {
            if (this.medialinks == null) {
                resortLinks();
            }
            return this.medialinks;
        }
        /**
         * Returns the email link view, computing all link views first if
         * this one is not available yet.
         *
         * @return the part of the anchors that links to email addresses
         */
        public Map getEmaillinks() {
            if (this.emaillinks == null) {
                resortLinks();
            }
            return this.emaillinks;
        }
        /**
         * Sorts the anchors and images of the document into the three
         * derived views:
         * - emaillinks: anchors starting with "mailto:" (key without that prefix)
         * - medialinks: anchors whose file extension is listed in mediaExt,
         *   plus all images
         * - hyperlinks: all remaining anchors that have an extension
         * Afterwards the hyperlinks are expanded by expandHyperlinks().
         *
         * A missing (null) anchor or image map is treated as empty and
         * entries with a null url are skipped.
         * NOTE(review): anchors without any '.' in the url end up in none of
         * the views; this is kept as it was.
         */
        private synchronized void resortLinks() {
            hyperlinks = new HashMap();
            medialinks = new HashMap();
            emaillinks = new HashMap();

            Iterator i;
            Map.Entry entry;
            String url;
            String normal;

            if (anchors != null) {
                // the surrounding commas allow matching whole extensions only
                final String mediaExtList = "," + mediaExt + ",";
                i = anchors.entrySet().iterator();
                while (i.hasNext()) {
                    entry = (Map.Entry) i.next();
                    url = (String) entry.getKey();
                    if (url == null) continue; // avoid NullPointerException

                    if (url.startsWith("mailto:")) {
                        emaillinks.put(url.substring(7), entry.getValue());
                        continue;
                    }

                    int extpos = url.lastIndexOf(".");
                    if (extpos <= 0) continue;

                    normal = urlNormalform(url);
                    if (normal == null) continue;

                    String ext = url.substring(extpos + 1).toLowerCase();
                    boolean isMedia = (ext.length() > 0) && (mediaExtList.indexOf("," + ext + ",") >= 0);
                    if (isMedia) {
                        // this is not a normal anchor, it is a media link
                        medialinks.put(normal, entry.getValue());
                    } else {
                        hyperlinks.put(normal, entry.getValue());
                    }
                }
            }

            // finally add the images to the medialinks
            if (images != null) {
                i = images.entrySet().iterator();
                while (i.hasNext()) {
                    entry = (Map.Entry) i.next();
                    url = (String) entry.getKey();
                    normal = urlNormalform(url);
                    if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
                }
            }

            expandHyperlinks();
        }
        /**
         * Adds artificial hyperlinks to the hyperlink set that can be
         * calculated from the given hyperlinks and media links.
         *
         * The order of the four steps matters: each step adds to the
         * hyperlink map, and the sub-path expansion of the hyperlinks already
         * sees the entries added by the two steps before it.
         * NOTE(review): the method dereferences the hyperlink map directly;
         * it looks like it relies on resortLinks() having created the link
         * maps before - confirm for external callers.
         */
        public synchronized void expandHyperlinks() {
            // we add artificial hyperlinks to the hyperlink set that can be calculated from
            // given hyperlinks and imagelinks
            hyperlinks.putAll(allReflinks(hyperlinks));
            hyperlinks.putAll(allReflinks(medialinks));
            hyperlinks.putAll(allSubpaths(hyperlinks));
            hyperlinks.putAll(allSubpaths(medialinks));
        }
    }
    private static Map allReflinks(Map links) {
        // we find all links that are part of a reference inside a url
        HashMap v = new HashMap();
        Iterator i = links.keySet().iterator();
@ -293,7 +278,7 @@ public class plasmaParser {
        return v;
    }
-    private static Map allSubpaths(Map links) {
+    static Map allSubpaths(Map links) {
        HashMap v = new HashMap();
        Iterator i = links.keySet().iterator();
        String s;
@ -312,4 +297,93 @@ public class plasmaParser {
        return v;
    }
    /**
     * Manual smoke test: loads the parser configuration from "yacy.parser",
     * reads a sample PDF file completely into memory and hands it to
     * parseSource with the mime type "application/pdf".
     *
     * @param args optional; args[0] is the path of the sample file to parse.
     *             Without arguments the historic default path is used.
     */
    public static void main(String[] args) {
        // allow the sample document to be chosen on the command line
        String sampleFile = (args != null && args.length > 0) ? args[0] : "Y:/public_html/test.pdf";
        FileInputStream theInput = null;
        try {
            plasmaParser theParser = new plasmaParser(new File("yacy.parser"));
            theInput = new FileInputStream(new File(sampleFile));
            ByteArrayOutputStream theOutput = new ByteArrayOutputStream();
            serverFileUtils.copy(theInput, theOutput);
            theParser.parseSource(new URL("http://brain"),"application/pdf",theOutput.toByteArray());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // always release the file handle, also when copying or parsing failed
            if (theInput != null) {
                try {
                    theInput.close();
                } catch (Exception ignored) {
                    // nothing sensible can be done if closing fails
                }
            }
        }
    }
 }
 /**
  * Keyed object factory used by the parser pool. The pool key is the fully
  * qualified class name of the object to create; instances are created
  * reflectively through their public no-argument constructor.
  */
 final class plasmaParserFactory implements KeyedPoolableObjectFactory {
    public plasmaParserFactory() {
        super();
    }
    /**
     * Creates a new instance of the class named by the key.
     *
     * @param key the fully qualified class name, must be a String
     * @return a new instance of that class
     * @throws IllegalArgumentException if the key is not a String
     * @throws Exception if the class cannot be loaded or instantiated
     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#makeObject(java.lang.Object)
     */
    public Object makeObject(Object key) throws Exception {
        if (!(key instanceof String))
            throw new IllegalArgumentException("The object key must be of type string.");
        Class moduleClass = Class.forName((String)key);
        return moduleClass.newInstance();
    }
    /**
     * Nothing has to be released for a pooled object, so this is a no-op.
     *
     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#destroyObject(java.lang.Object, java.lang.Object)
     */
    public void destroyObject(Object key, Object obj) {
        // intentionally empty
    }
    /**
     * Every object is reported as valid.
     *
     * @return always true
     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#validateObject(java.lang.Object, java.lang.Object)
     */
    public boolean validateObject(Object key, Object obj) {
        return true;
    }
    /**
     * No preparation is needed when an object is activated.
     *
     * @param key the pool key of the object
     * @param obj the pooled object
     */
    public void activateObject(Object key, Object obj)  {
        // intentionally empty
    }
    /**
     * Resets a Parser when it is passivated; other objects are left untouched.
     *
     * @param key the pool key of the object
     * @param obj the pooled object
     */
    public void passivateObject(Object key, Object obj) {
        if (obj instanceof Parser)  {
            ((Parser) obj).reset();
        }
    }
 }
 /**
  * Keyed object pool for parser instances. It adds no behaviour of its own:
  * the constructor and both overridden methods delegate directly to
  * GenericKeyedObjectPool.
  */
 final class plasmaParserPool extends GenericKeyedObjectPool {
    /**
     * @param objFactory the factory handed to the superclass constructor
     * @param config the pool configuration handed to the superclass constructor
     */
    public plasmaParserPool(plasmaParserFactory objFactory,
            GenericKeyedObjectPool.Config config) {
        super(objFactory, config);
    }
    /**
     * Delegates unchanged to the superclass implementation.
     *
     * @param key the pool key
     * @return whatever the superclass returns for this key
     */
    public Object borrowObject(Object key) throws Exception  {
       return super.borrowObject(key);
    }
    /**
     * Delegates unchanged to the superclass implementation.
     *
     * @param key the pool key
     * @param borrowed the object to hand back
     */
    public void returnObject(Object key, Object borrowed) throws Exception  {
        super.returnObject(key,borrowed);
    }        
 }   
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@ -0,0 +1,192 @@
 /*
 * Check4Update is a stand-alone server application that can be used to 
 * monitor various types of online resources for updates and changes and
 * notifies the user if a modification was detected.
 * 
 * Copyright (C) 2005 Martin Thelian
 * 
 * This program is free software; you can redistribute it and/or modify 
 * it under the terms of the GNU General Public License as published by 
 * the Free Software Foundation; either version 2 of the License, or (at 
 * your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty of 
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License 
 * along with this program; if not, write to the Free Software Foundation, 
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 * 
 * For more information, please email thelian@users.sourceforge.net
 * 
 */ 
 /* =======================================================================
 * Revision Control Information
 * $Source: $
 * $Author: $
 * $Date: $
 * $Revision: $
 * ======================================================================= */
 package de.anomic.plasma;
 import java.net.URL;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 /**
  * Holds the result of parsing one resource: its meta data, the visible text
  * and the links found in it. The hyperlink, media link and email link maps
  * are calculated lazily from the anchors and images on first access.
  */
 public class plasmaParserDocument {
    URL location;       // the source url
    String mimeType;    // mimeType as taken from http header
    String keywords;    // most resources provide a keyword field
    String shortTitle;  // a shortTitle mostly appears in the window header (border)
    String longTitle;   // the real title of the document, commonly h1-tags
    String[] sections;  // if present: more titles/headlines appearing in the document
    String abstrct;     // an abstract, if present: short content description
    byte[] text;        // the clear text, all that is visible
    Map anchors;        // all links embedded as clickable entities (anchor tags)
    Map images;         // all visible pictures in document
    // the anchors and images - Maps are URL-to-EntityDescription mappings.
    // The EntityDescription appear either as visible text in anchors or as alternative
    // text in image tags.
    Map hyperlinks;     // calculated lazily, see resortLinks()
    Map medialinks;     // calculated lazily, see resortLinks()
    Map emaillinks;     // calculated lazily, see resortLinks()

    /**
     * Creates a document from the values delivered by a parser.
     *
     * @param anchors URL-to-description map of the anchors; null is replaced by an empty map
     * @param images  URL-to-description map of the images; null is replaced by an empty map
     */
    public plasmaParserDocument(URL location, String mimeType,
                    String keywords, String shortTitle, String longTitle,
                    String[] sections, String abstrct,
                    byte[] text, Map anchors, Map images) {
        this.location = location;
        this.mimeType = mimeType;
        this.keywords = keywords;
        this.shortTitle = shortTitle;
        this.longTitle = longTitle;
        this.sections = sections;
        this.abstrct = abstrct;
        this.text = text;
        this.anchors = (anchors==null)?new HashMap():anchors;
        this.images = (images==null)?new HashMap():images;
        this.hyperlinks = null;
        this.medialinks = null;
        this.emaillinks = null;
    }

    /**
     * Resolves a path relative to the document location and normalizes it.
     *
     * @return the normalized url, or an empty string if it cannot be built
     */
    private String absolutePath(String relativePath) {
        try {
            return plasmaParser.urlNormalform(new URL(location, relativePath));
        } catch (Exception e) {
            return "";
        }
    }

    /** @return the short title, or the long title if no short title is set */
    public String getMainShortTitle() {
        if (shortTitle != null) return shortTitle; else return longTitle;
    }

    /** @return the long title, or the short title if no long title is set */
    public String getMainLongTitle() {
        if (longTitle != null) return longTitle; else return shortTitle;
    }

    /** @return the section titles, or a one-element array with the main long title */
    public String[] getSectionTitles() {
        if (sections != null) return sections; else return new String[]{getMainLongTitle()};
    }

    /** @return the abstract, or the main long title if no abstract is set */
    public String getAbstract() {
        if (abstrct != null) return abstrct; else return getMainLongTitle();
    }

    public byte[] getText() {
        // returns only the clear (visible) text (not the source data)
        return text;
    }

    public Map getAnchors() {
        // returns all links embedded as anchors (clickable entities)
        return anchors;
    }

    public Map getImages() {
        // returns all links embedded as pictures (visible in document)
        return images;
    }

    // the next three methods provide a calculated view on the getAnchors/getImages:
    public Map getHyperlinks() {
        // this is a subset of the getAnchor-set: only links to other hyperrefs
        if (hyperlinks == null) resortLinks();
        return hyperlinks;
    }

    public Map getMedialinks() {
        // this is partly subset of getAnchor and getImage: all non-hyperrefs
        if (medialinks == null) resortLinks();
        return medialinks;
    }

    public Map getEmaillinks() {
        // this is part of the getAnchor-set: only links to email addresses
        if (emaillinks == null) resortLinks();
        return emaillinks;
    }

    /**
     * Sorts the anchors into email links, media links and hyperlinks, adds
     * the images to the media links and finally expands the hyperlinks.
     *
     * Anchors with a null key are skipped. Anchors that are not mailto-links
     * and contain no '.' after the first character are not put into any map.
     */
    private synchronized void resortLinks() {
        Iterator i;
        Map.Entry entry;
        String url;
        String normal;
        String ext;
        int extpos;

        hyperlinks = new HashMap();
        medialinks = new HashMap();
        emaillinks = new HashMap();

        i = anchors.entrySet().iterator();
        while (i.hasNext()) {
            entry = (Map.Entry) i.next();
            url = (String) entry.getKey();
            // a map may carry a null key; there is nothing to classify then and
            // dereferencing it below would throw a NullPointerException
            if (url == null) continue;
            if (url.startsWith("mailto:")) {
                // store the address without the "mailto:" prefix
                emaillinks.put(url.substring(7), entry.getValue());
            } else {
                extpos = url.lastIndexOf(".");
                if (extpos > 0) {
                    ext = url.substring(extpos).toLowerCase();
                    normal = plasmaParser.urlNormalform(url);
                    if (normal != null) {
                        if (plasmaParser.mediaExt.indexOf(ext.substring(1)) >= 0) {
                            // this is not a normal anchor, it is a media link
                            medialinks.put(normal, entry.getValue());
                        } else {
                            hyperlinks.put(normal, entry.getValue());
                        }
                    }
                }
            }
        }

        // finally add the images to the medialinks
        i = images.entrySet().iterator();
        while (i.hasNext()) {
            entry = (Map.Entry) i.next();
            url = (String) entry.getKey();
            normal = plasmaParser.urlNormalform(url);
            if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
        }
        expandHyperlinks();
    }

    /**
     * Adds artificial hyperlinks to the hyperlink map that can be calculated
     * from the given hyperlinks and media links. The link maps must already
     * have been built (see resortLinks).
     */
    public synchronized void expandHyperlinks() {
        hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks));
        hyperlinks.putAll(plasmaParser.allReflinks(medialinks));
        hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks));
        hyperlinks.putAll(plasmaParser.allSubpaths(medialinks));
    }
 }
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -149,7 +149,9 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
    public  kelondroTables         facilityDB;
    public  plasmaParser           parser;
    public  int                    serverJobs;
-    public boolean terminate = false;
+    
    private serverSemaphore shutdownSync = new serverSemaphore(0);
    private boolean terminate = false;
    public plasmaSwitchboard(String rootPath, String initPath, String configPath) throws IOException {
 	super(rootPath, initPath, configPath);
@ -207,7 +209,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
        initProfiles();
        // make parser
-        parser = new plasmaParser(new File(""));
+        parser = new plasmaParser(new File("yacy.parser"));
        // start indexing management
        loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL);
@ -502,7 +504,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
 	    log.logDebug(stats + " processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", source=" + ((entry.cacheArray == null) ? "scraper" : "byte[]") + ", url=" + entry.nomalizedURLString); // DEBUG
            // parse content
-            plasmaParser.document document;
+            plasmaParserDocument document;
            if (entry.scraper != null) {
                log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is pre-parsed by scraper");
                document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
@ -1397,4 +1399,18 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
        if (adminAccountBase64MD5.equals(serverCodings.standardCoder.encodeMD5Hex(authorization))) return 4; // hard-authenticated, all ok
        return 0; // wrong password
    }
    /**
     * Requests the shutdown: sets the terminate flag and then releases the
     * shutdown semaphore once, which lets a caller blocked in
     * waitForShutdown() continue.
     */
    public void terminate() {
        // set the flag first so that the released waiter already sees it
        this.terminate = true;
        this.shutdownSync.V();
    }
    /**
     * @return the current value of the terminate flag, which terminate() sets to true
     */
    public boolean isTerminated() {
        return this.terminate;
    }
    /**
     * Blocks on the shutdown semaphore, which is created with an initial
     * value of 0, until terminate() releases it.
     *
     * @return the value of the terminate flag after the wait has ended
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    public boolean waitForShutdown() throws InterruptedException {
        this.shutdownSync.P();
        return this.terminate;
    }
 }
--- a/source/de/anomic/server/serverSemaphore.java
+++ b/source/de/anomic/server/serverSemaphore.java
@ -0,0 +1,104 @@
 //serverSemaphore.java 
 //------------------------
 //part of YaCy
 //(C) by Michael Peter Christen; mc@anomic.de
 //first published on http://www.anomic.de
 //Frankfurt, Germany, 2005
 //
 //this file is contributed by Martin Thelian
 //last major change: 24.04.2005
 //
 //This program is free software; you can redistribute it and/or modify
 //it under the terms of the GNU General Public License as published by
 //the Free Software Foundation; either version 2 of the License, or
 //(at your option) any later version.
 //
 //This program is distributed in the hope that it will be useful,
 //but WITHOUT ANY WARRANTY; without even the implied warranty of
 //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 //GNU General Public License for more details.
 //
 //You should have received a copy of the GNU General Public License
 //along with this program; if not, write to the Free Software
 //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 //
 //Using this software in any meaning (reading, learning, copying, compiling,
 //running) means that you agree that the Author(s) is (are) not responsible
 //for cost, loss of data or any harm that may be caused directly or indirectly
 //by usage of this softare or this documentation. The usage of this software
 //is on your own risk. The installation and usage (starting/running) of this
 //software may allow other people or application to access your computer and
 //any attached devices and is highly dependent on the configuration of the
 //software which must be done by the user of the software; the author(s) is
 //(are) also not responsible for proper configuration and usage of the
 //software, even if provoked by documentation provided together with
 //the software.
 //
 //Any changes to this file according to the GPL as documented in the file
 //gpl.txt aside this file in the shipment you received can be done to the
 //lines that follows this copyright notice here, but changes must not be
 //done inside the copyright notive above. A re-distribution must contain
 //the intact and unchanged copyright notice.
 //Contributions and changes to the program code must be marked as such.
 package de.anomic.server;
 /**
  * A counting semaphore built on the object monitor.
  *
  * The counter holds the number of available permits and never drops below
  * zero: P() waits until a permit is available and takes it, V() returns a
  * permit and wakes up one waiting thread. The counter may grow up to, but
  * not beyond, the configured maximum value.
  */
 public final class serverSemaphore  {
    private long currentValue = 0;    
    private long maximumValue = Long.MAX_VALUE;

    /** Creates a semaphore without permits and with the largest possible maximum. */
    protected serverSemaphore()  {
        this(0,Long.MAX_VALUE);
    }

    /**
     * Creates a semaphore with the given number of permits and the largest
     * possible maximum.
     *
     * @param initialValue the number of initially available permits, must be >= 0
     */
    public serverSemaphore(long initialValue)  {
        this(initialValue,Long.MAX_VALUE);
    }    

    /**
     * Creates a semaphore with the given number of permits and upper bound.
     *
     * @param initialValue the number of initially available permits, must be >= 0
     * @param maxValue the largest value the counter may reach, must be >= 1
     *                 and must not be less than initialValue
     * @throws IllegalArgumentException if one of the constraints is violated
     */
    protected serverSemaphore(long initialValue, long maxValue) {
        /* some errorhandling */
        if (maxValue < initialValue) {
            throw new IllegalArgumentException("The semaphore maximum value must not be " +
                                               "less than the semaphore init value.");
        }
        if (maxValue < 1)  {
             throw new IllegalArgumentException("The semaphore maximum value must be greater or equal 1.");          
        }
        if (initialValue < 0) {
             throw new IllegalArgumentException("The semaphore initial value must be greater or equal 0.");          
        }
        // setting the initial semaphore values
        this.currentValue = initialValue;
        this.maximumValue = maxValue;        
    }

    /**
     * Acquires one permit, blocking until one is available.
     *
     * @throws InterruptedException if the thread is interrupted while waiting;
     *         no permit is consumed in that case
     */
    public synchronized void P() throws InterruptedException
    {   
        // wait() may return without a matching V() (spurious wakeup), and another
        // thread may have taken the permit in the meantime, so the condition has
        // to be re-checked in a loop
        while (this.currentValue <= 0) {
            wait();
        }
        this.currentValue--;
    }

    /**
     * Releases one permit and wakes up one thread waiting in P().
     *
     * @throws IndexOutOfBoundsException if the counter already holds the maximum value
     */
    public synchronized void V() {
        // the maximum itself is a legal value, only exceeding it is an error
        if (this.currentValue >= this.maximumValue) {
              throw new IndexOutOfBoundsException("The maximum value of the semaphore was reached");
        }        
        this.currentValue++;        
        notify();
     }
 }
--- a/source/yacy.java
+++ b/source/yacy.java
@ -79,7 +79,7 @@ public final class yacy {
    // static objects
    private static final String vString = "@REPL_VERSION@";
-    private static final String vDATE   = "@REPL_DATE@";
+    private static final String vDATE   = "20050422";
    private static final String copyright = "[ YACY Proxy v" + vString + ", build " + vDATE + " by Michael Christen / www.yacy.net ]";
    private static final String hline = "-------------------------------------------------------------------------------";
@ -237,10 +237,9 @@ public final class yacy {
 			serverSystem.openBrowser("http://localhost:" + port + "/" + browserPopUpPage, browserPopUpApplication);
 		    }
-		    // loop and wait
+            // wait for server shutdown
-		    while (!(sb.terminate)) try {
+			try {
-			Thread.currentThread().sleep(1000); // wait a while
+                sb.waitForShutdown();
                        // System.gc(); // prevent that we catch too much memory
 		    } catch (Exception e) {
                        serverLog.logError("MAIN CONTROL LOOP", "PANIK: " + e.getMessage());
                        e.printStackTrace();
@ -259,7 +258,7 @@ public final class yacy {
                    // idle until the processes are down
                    while (server.isAlive()) {
-			Thread.currentThread().sleep(2000); // wait a while
+						Thread.currentThread().sleep(2000); // wait a while
                    }
                    serverLog.logSystem("SHUTDOWN", "server has terminated");
                    sb.close();
--- a/yacy.parser
+++ b/yacy.parser
@ -0,0 +1 @@
 application/pdf=de.anomic.plasma.parser.pdf.pdfParser
		`@ -0,0 +1 @@`
							`application/pdf=de.anomic.plasma.parser.pdf.pdfParser`