*) adding a new package for extra content parsers

*) adding content parsers for - pdf (using the pdf-box library) - doc (using the textmining.org library) *) adding an interface for content parsers *) adding a configuration file which can be used to configure which parser is used for which mimeType *) Semaphore class was moved and renamed to serverSemaphore *) Changing yacy shutdown behaviour: the busy-waiting loop for shutdown was removed and replaced with a blocking call (using the semaphore class mentioned above) to the new switchboard.waitForShutdown method. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@46 6c8d7289-2bf4-0310-a012-ef5d649a1542
20 years ago · 58b1a0ba40
parent 17d993cfee
commit 58b1a0ba40
14 changed files with 909 additions and 250 deletions
--- a/build.xml
+++ b/build.xml
@ -5,6 +5,7 @@
 	
 <property name="src" location="source"/>
 <property name="lib" location="lib"/>
+<property name="libx" location="libx"/>
 <property name="build" location="classes"/>
 <property name="htroot" location="htroot"/>

@ -39,6 +40,13 @@
        	<!-- libs needed for the yacy thread/object-pools -->
          	<pathelement location="${lib}/commons-collections.jar" />
        	<pathelement location="${lib}/commons-pool-1.2.jar" />        	
+        	
+        	<!-- libs needed to parse pdf files -->
+          	<pathelement location="${libx}/PDFBox-0.7.1.jar" />        	
+          	<pathelement location="${libx}/log4j-1.2.9.jar" />    
+        	
+        	<!-- libs needed for parsing doc files -->
+          	<pathelement location="${libx}/tm-extractors-0.4.jar" />         		
        </classpath>		
 	</javac>	
 	<javac srcdir="${htroot}/" destdir="${htroot}" classpath="${build}"/>
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@ -114,7 +114,7 @@ public class CacheAdmin_p {
                    else {
                        htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
                        OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
-                        plasmaParser.document document = switchboard.parser.transformScraper(url, "text/html", scraper);
+                        plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
                        serverFileUtils.copy(file, os);
                        info += "<b>HEADLINE:</b><br>" + scraper.getHeadline() + "<br><br>";
                        info += "<b>HREF:</b><br>" + formatAnchor(document.getHyperlinks()) + "<br>";
--- a/htroot/Steering.java
+++ b/htroot/Steering.java
@ -73,7 +73,7 @@ public class Steering {
        }
        
 	if (post.containsKey("shutdown")) {
-            switchboard.terminate = true;
+            switchboard.terminate();
            prop.put("info", 3);//shutting down
            return prop;
        }
--- a/source/de/anomic/plasma/parser/Parser.java
+++ b/source/de/anomic/plasma/parser/Parser.java
@ -0,0 +1,59 @@
+/*
+ * Check4Update is a stand-alone server application that can be used to 
+ * monitor various types of online resources for updates and changes and
+ * notifies the user if a modification was detected.
+ * 
+ * Copyright (C) 2005 Martin Thelian
+ * 
+ * This program is free software; you can redistribute it and/or modify 
+ * it under the terms of the GNU General Public License as published by 
+ * the Free Software Foundation; either version 2 of the License, or (at 
+ * your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful, but 
+ * WITHOUT ANY WARRANTY; without even the implied warranty of 
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License 
+ * along with this program; if not, write to the Free Software Foundation, 
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * 
+ * For more information, please email thelian@users.sourceforge.net
+ * 
+ */ 
+
+/* =======================================================================
+ * Revision Control Information
+ * $Source: $
+ * $Author: $
+ * $Date: $
+ * $Revision: $
+ * ======================================================================= */
+
+package de.anomic.plasma.parser;
+
+import java.io.File;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.HashSet;
+
+import de.anomic.plasma.plasmaParserDocument;
+
+/**
+ * Contract for content parsers that turn a resource of a given mime type
+ * into a {@link plasmaParserDocument}.
+ *
+ * The three <code>parse</code> variants differ only in how the raw content
+ * is handed over: as a byte array, as a file, or as a stream.
+ */
+public interface Parser {
+
+    /**
+     * Parses content that is held completely in memory.
+     *
+     * @param location the URL the content belongs to
+     * @param mimeType the mime type of the content
+     * @param source the raw content
+     * @return the parsed document
+     * @throws ParserException if the content could not be parsed
+     */
+    plasmaParserDocument parse(URL location, String mimeType, byte[] source)
+            throws ParserException;
+
+    /**
+     * Parses content that is stored in a file.
+     *
+     * @param location the URL the content belongs to
+     * @param mimeType the mime type of the content
+     * @param sourceFile the file holding the raw content
+     * @return the parsed document
+     * @throws ParserException if the content could not be parsed
+     */
+    plasmaParserDocument parse(URL location, String mimeType, File sourceFile)
+            throws ParserException;
+
+    /**
+     * Parses content that is read from a stream.
+     *
+     * @param location the URL the content belongs to
+     * @param mimeType the mime type of the content
+     * @param source the stream delivering the raw content
+     * @return the parsed document
+     * @throws ParserException if the content could not be parsed
+     */
+    plasmaParserDocument parse(URL location, String mimeType, InputStream source)
+            throws ParserException;
+
+    /**
+     * @return the set of mime types this parser is able to handle
+     */
+    HashSet getSupportedMimeTypes();
+
+    /**
+     * NOTE(review): presumably meant to clear per-parse state before a parser
+     * instance is reused; nothing here shows when it is called - confirm.
+     */
+    void reset();
+}
--- a/source/de/anomic/plasma/parser/ParserException.java
+++ b/source/de/anomic/plasma/parser/ParserException.java
@ -0,0 +1,21 @@
+package de.anomic.plasma.parser;
+
+/**
+ * Checked exception thrown by content parsers when a resource cannot be parsed.
+ * It mirrors the four standard {@link Exception} constructors.
+ */
+public class ParserException extends Exception {
+
+    /** Creates an exception without detail message and without cause. */
+    public ParserException() {
+        super();
+    }
+
+    /**
+     * @param cause the underlying problem that made parsing fail
+     */
+    public ParserException(Throwable cause) {
+        super(cause);
+    }
+
+    /**
+     * @param message description of what went wrong
+     */
+    public ParserException(String message) {
+        super(message);
+    }
+
+    /**
+     * @param message description of what went wrong
+     * @param cause the underlying problem that made parsing fail
+     */
+    public ParserException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
--- a/source/de/anomic/plasma/parser/doc/docParser.java
+++ b/source/de/anomic/plasma/parser/doc/docParser.java
@ -0,0 +1,127 @@
+/*
+ * Check4Update is a stand-alone server application that can be used to 
+ * monitor various types of online resources for updates and changes and
+ * notifies the user if a modification was detected.
+ * 
+ * Copyright (C) 2005 Martin Thelian
+ * 
+ * This program is free software; you can redistribute it and/or modify 
+ * it under the terms of the GNU General Public License as published by 
+ * the Free Software Foundation; either version 2 of the License, or (at 
+ * your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful, but 
+ * WITHOUT ANY WARRANTY; without even the implied warranty of 
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License 
+ * along with this program; if not, write to the Free Software Foundation, 
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * 
+ * For more information, please email thelian@users.sourceforge.net
+ * 
+ */ 
+
+/* =======================================================================
+ * Revision Control Information
+ * $Source: $
+ * $Author: $
+ * $Date: $
+ * $Revision: $
+ * ======================================================================= */
+
+package de.anomic.plasma.parser.doc;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import org.textmining.text.extraction.WordExtractor;
+
+
+import de.anomic.plasma.plasmaParserDocument;
+import de.anomic.plasma.parser.Parser;
+import de.anomic.plasma.parser.ParserException;
+
+/**
+ * Parser for Microsoft Word documents (mime type application/msword).
+ * The plain text is extracted with the textmining.org {@link WordExtractor}
+ * and wrapped into a {@link plasmaParserDocument}.
+ */
+public class docParser implements Parser {
+
+    /**
+     * a list of mime types that are supported by this parser class
+     */
+    public static final HashSet<String> SUPPORTED_MIME_TYPES = new HashSet<String>(Arrays.asList(new String[] {
+        "application/msword"
+    }));
+
+    public docParser() {
+        super();
+    }
+
+    /**
+     * Parses a Word document that is held completely in memory.
+     *
+     * @param location the URL the content belongs to
+     * @param mimeType the mime type of the content
+     * @param source the raw document bytes
+     * @return the parsed document
+     * @throws ParserException if the content could not be parsed
+     */
+    public plasmaParserDocument parse(URL location, String mimeType,
+            byte[] source) throws ParserException {
+        return this.parse(location, mimeType, new ByteArrayInputStream(source));
+    }
+
+    /**
+     * Parses a Word document stored in a file. The file stream is opened and
+     * closed by this method.
+     *
+     * @param location the URL the content belongs to
+     * @param mimeType the mime type of the content
+     * @param sourceFile the file holding the document
+     * @return the parsed document
+     * @throws ParserException if the file cannot be opened or its content
+     *         could not be parsed
+     */
+    public plasmaParserDocument parse(URL location, String mimeType,
+            File sourceFile) throws ParserException {
+        BufferedInputStream contentInputStream;
+        try {
+            contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
+        } catch (FileNotFoundException e) {
+            // fail right here with the real reason instead of handing a null stream to the extractor
+            throw new ParserException("Unable to open the doc file " + sourceFile + ". " + e.getMessage(), e);
+        }
+
+        try {
+            return this.parse(location, mimeType, contentInputStream);
+        } finally {
+            // the stream was opened by this method, so it is released here as well
+            try {
+                contentInputStream.close();
+            } catch (java.io.IOException ignored) {
+                // nothing useful can be done if closing a read-only stream fails
+            }
+        }
+    }
+
+    /**
+     * Parses a Word document read from a stream. The stream is not closed
+     * by this method.
+     *
+     * @param location the URL the content belongs to
+     * @param mimeType the mime type of the content
+     * @param source the stream delivering the document
+     * @return a document that carries only location, mime type and text
+     * @throws ParserException if the text could not be extracted; the
+     *         original exception is attached as cause
+     */
+    public plasmaParserDocument parse(URL location, String mimeType,
+            InputStream source) throws ParserException {
+        try {
+            WordExtractor extractor = new WordExtractor();
+            String contents = extractor.extractText(source);
+
+            // NOTE(review): getBytes() uses the platform default charset - confirm that is intended
+            return new plasmaParserDocument(
+                    location,
+                    mimeType,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    contents.getBytes(),
+                    null,
+                    null);
+        } catch (Exception e) {
+            throw new ParserException("Unable to parse the doc content. " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * @return the mime types supported by this parser
+     */
+    public HashSet getSupportedMimeTypes() {
+        return docParser.SUPPORTED_MIME_TYPES;
+    }
+
+    public void reset() {
+        // TODO Auto-generated method stub
+    }
+
+    /**
+     * @param args
+     */
+    public static void main(String[] args) {
+        // TODO Auto-generated method stub
+    }
+
+}
--- a/source/de/anomic/plasma/parser/pdf/pdfParser.java
+++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java
@ -0,0 +1,119 @@
+package de.anomic.plasma.parser.pdf;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+
+
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.pdmodel.PDDocumentInformation;
+import org.pdfbox.util.PDFTextStripper;
+
+import de.anomic.plasma.plasmaParserDocument;
+import de.anomic.plasma.parser.Parser;
+import de.anomic.plasma.parser.ParserException;
+
+/**
+ * Parser for PDF documents (mime type application/pdf) based on the PDFBox
+ * library. The keywords, subject and title of the document information and
+ * the extracted text are wrapped into a {@link plasmaParserDocument}.
+ */
+public class pdfParser implements Parser {
+
+    /**
+     * a list of mime types that are supported by this parser class
+     */
+    public static final HashSet<String> SUPPORTED_MIME_TYPES = new HashSet<String>(Arrays.asList(new String[] {
+        "application/pdf"
+    }));
+
+    public pdfParser() {
+        super();
+    }
+
+    /**
+     * @return the mime types supported by this parser
+     */
+    public HashSet getSupportedMimeTypes() {
+        return SUPPORTED_MIME_TYPES;
+    }
+
+    /**
+     * Parses a PDF document stored in a file. The file stream is opened and
+     * closed by this method.
+     *
+     * @param location the URL the content belongs to
+     * @param mimeType the mime type of the content
+     * @param sourceFile the file holding the document
+     * @return the parsed document
+     * @throws ParserException if the file cannot be opened or its content
+     *         could not be parsed
+     */
+    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException {
+        BufferedInputStream contentInputStream;
+        try {
+            contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
+        } catch (FileNotFoundException e) {
+            // fail right here with the real reason instead of handing a null stream to PDFBox
+            throw new ParserException("Unable to open the pdf file " + sourceFile + ". " + e.getMessage(), e);
+        }
+
+        try {
+            return this.parse(location, mimeType, contentInputStream);
+        } finally {
+            // the stream was opened by this method, so it is released here as well
+            try {
+                contentInputStream.close();
+            } catch (java.io.IOException ignored) {
+                // nothing useful can be done if closing a read-only stream fails
+            }
+        }
+    }
+
+    /**
+     * Parses a PDF document that is held completely in memory.
+     *
+     * @param location the URL the content belongs to
+     * @param mimeType the mime type of the content
+     * @param source the raw document bytes
+     * @return the parsed document
+     * @throws ParserException if the content could not be parsed
+     */
+    public plasmaParserDocument parse(URL location, String mimeType, byte[] source) throws ParserException {
+        return this.parse(location, mimeType, new ByteArrayInputStream(source));
+    }
+
+    /**
+     * Parses a PDF document read from a stream. The stream is not closed by
+     * this method, the PDFBox document created from it is.
+     *
+     * @param location the URL the content belongs to
+     * @param mimeType the mime type of the content
+     * @param source the stream delivering the document
+     * @return the parsed document
+     * @throws ParserException if the document could not be read; the
+     *         original exception is attached as cause
+     */
+    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
+
+        PDDocument theDocument = null;
+        try {
+            String docTitle = null, docSubject = null, docKeyWords = null;
+
+            PDFParser parser = new PDFParser(source);
+            parser.parse();
+
+            PDFTextStripper stripper = new PDFTextStripper();
+            theDocument = parser.getPDDocument();
+
+            PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
+            if (theDocInfo != null) {
+                docTitle = theDocInfo.getTitle();
+                docSubject = theDocInfo.getSubject();
+                docKeyWords = theDocInfo.getKeywords();
+            }
+
+            // NOTE(review): the writer uses the platform default charset - confirm that is intended
+            ByteArrayOutputStream out = new ByteArrayOutputStream();
+            OutputStreamWriter writer = new OutputStreamWriter(out);
+            stripper.writeText(theDocument, writer);
+            writer.close();
+
+            byte[] contents = out.toByteArray();
+
+            /*
+             *         public document(URL location, String mimeType,
+                            String keywords, String shortTitle, String longTitle,
+                            String[] sections, String abstrct,
+                            byte[] text, Map anchors, Map images) {
+             *
+             */
+            return new plasmaParserDocument(
+                    location,
+                    mimeType,
+                    docKeyWords,
+                    docSubject,
+                    docTitle,
+                    null,
+                    null,
+                    contents,
+                    null,
+                    null);
+        } catch (Exception e) {
+            throw new ParserException("Unable to parse the pdf content. " + e.getMessage(), e);
+        } finally {
+            // release the PDFBox document on the failure path as well
+            if (theDocument != null) {
+                try {
+                    theDocument.close();
+                } catch (Exception ignored) {
+                    // the text is already extracted (or a failure is already reported)
+                }
+            }
+        }
+    }
+
+    public void reset() {
+        // TODO Auto-generated method stub
+    }
+
+}
--- a/source/de/anomic/plasma/plasmaCrawlLoader.java
+++ b/source/de/anomic/plasma/plasmaCrawlLoader.java
@ -192,75 +192,14 @@ public final class plasmaCrawlLoader extends Thread {
 }


-
-final class Semaphore  {
-   private long currentValue = 0;    
-   private long maximumValue = Long.MAX_VALUE;
-    
-   protected Semaphore()  {
-       this(0,Long.MAX_VALUE);
-   }
-   
-   public Semaphore(long initialValue)  {
-       this(initialValue,Long.MAX_VALUE);
-   }    
-
-   protected Semaphore(long initialValue, long maxValue) {
-       /* some errorhandling */
-       if (maxValue < initialValue) {
-           throw new IllegalArgumentException("The semaphore maximum value must not be " +
-                                              "greater than the semaphore init value.");
-       }
-       
-       if (maxValue < 1)  {
-            throw new IllegalArgumentException("The semaphore maximum value must be greater or equal 1.");          
-       }
-       
-       if (initialValue < 0) {
-            throw new IllegalArgumentException("The semaphore initial value must be greater or equal 0.");          
-       }
-       
-       
-       // setting the initial Sempahore Values
-       this.currentValue = initialValue;
-       this.maximumValue = maxValue;        
-   }
-   
-   public synchronized void P() throws InterruptedException
-   {   
-        this.currentValue-- ;           
-       
-        if (this.currentValue < 0) {    
-            try  { 
-                wait();
-            } catch(InterruptedException e) { 
-                this.currentValue++;
-                throw e;
-            }
-        }
-   }
-
-   public synchronized void V() {
-       if (this.currentValue+1 == this.maximumValue) {
-             throw new IndexOutOfBoundsException("The maximum value of the semaphore was reached");
-       }        
-       
-       this.currentValue++;        
-        
-       if (this.currentValue <= 0) {
-           notify();
-       }   
-    }
-}
-
 class CrawlerMessageQueue {
-    private final Semaphore readSync;
-    private final Semaphore writeSync;
+    private final serverSemaphore readSync;
+    private final serverSemaphore writeSync;
    private final ArrayList messageList;
    
    public CrawlerMessageQueue()  {
-        this.readSync  = new Semaphore (0);
-        this.writeSync = new Semaphore (1);
+        this.readSync  = new serverSemaphore (0);
+        this.writeSync = new serverSemaphore (1);
        
        this.messageList = new ArrayList(10);        
    }
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@ -45,52 +45,155 @@ package de.anomic.plasma;
 import java.io.*;
 import java.net.*;
 import java.util.*;
-import de.anomic.server.*;
+
+import org.apache.commons.pool.KeyedPoolableObjectFactory;
+import org.apache.commons.pool.impl.GenericKeyedObjectPool;
+import org.apache.commons.pool.impl.GenericObjectPool;
+
+import de.anomic.plasma.parser.Parser;
+import de.anomic.server.serverFileUtils;
 import de.anomic.htmlFilter.*;

-public class plasmaParser {
+public final class plasmaParser {
    
    public static String mediaExt =
        "swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
-        "sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj";
+        "sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj";
    
+    private final Properties parserList;
+
+	private final plasmaParserPool theParserPool;

    public plasmaParser(File parserDispatcherPropertyFile) {
-        // this is only a dummy yet because we have only one parser...
        
+        // Loads the mimeType -> parser class name mapping from the given property file,
+        // sets up the parser object pool and drops every mapping whose class cannot be loaded.
+        Properties prop = new Properties();
+        FileInputStream propertyStream = null;
+        try {
+            propertyStream = new FileInputStream(parserDispatcherPropertyFile);
+            prop.load(propertyStream);
+        } catch (IOException e) {
+            System.err.println("ERROR: " + parserDispatcherPropertyFile.toString() + " not found in settings path");
+        } finally {
+            // the property file stream is only needed while loading
+            if (propertyStream != null) {
+                try {
+                    propertyStream.close();
+                } catch (IOException ignored) {
+                    // nothing useful can be done if closing a read-only stream fails
+                }
+            }
+        }
+        this.parserList = prop;
+
+        /*
+         * initializing the parser object pool
+         */
+        GenericKeyedObjectPool.Config config = new GenericKeyedObjectPool.Config();
+
+        // The maximum number of active parsers that can be allocated from pool at the same time,
+        // 0 for no limit
+        config.maxActive = 0;
+
+        // The maximum number of idle parsers in the pool
+        config.maxIdle = 10;
+
+        config.whenExhaustedAction = GenericObjectPool.WHEN_EXHAUSTED_BLOCK;
+        config.minEvictableIdleTimeMillis = 30000;
+
+        this.theParserPool = new plasmaParserPool(new plasmaParserFactory(),config);
+
+        /* testing if all parsers could be loaded properly.
+         * This is done now to avoid surprises at runtime. */
+        Iterator parserIterator = this.parserList.entrySet().iterator();
+        while (parserIterator.hasNext()) {
+            Map.Entry parserEntry = (Map.Entry) parserIterator.next();
+            String className = (String) parserEntry.getValue();
+            try {
+                Class.forName(className);
+            } catch (Exception e) {
+                // if we could not load the parser we remove it from the parser list ...
+                // The list is keyed by mime type, so the entry is removed through the
+                // iterator; remove(className) would look up the value as a key.
+                System.err.println("ERROR: Unable to load parser class " + className +
+                                   " for type " + parserEntry.getKey());
+                parserIterator.remove();
+            }
+        }
    }
    
    public void close() {
-        // frees resources; does nothing yet
-    }
+        // release resources: forget the configured parsers and close the parser object pool
+        try {
+            // clearing the parser list
+            this.parserList.clear();
+
+            // closing the parser object pool
+            this.theParserPool.close();
+        } catch (Exception e) {
+            // closing stays best effort, but a failure is reported instead of being swallowed
+            System.err.println("ERROR: Unable to close the parser pool. " + e.getMessage());
+        }
+    }
    
-    public document parseSource(URL location, String mimeType, byte[] source) {
-        // make a scraper and transformer
-        htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
-        OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
+    /**
+     * Parses content that is held in memory. If a parser is configured for the
+     * mime type it is borrowed from the pool and used, otherwise the content is
+     * run through the html scraper.
+     *
+     * @param location the URL the content belongs to
+     * @param mimeType the mime type; parameters after a ';' are cut off
+     * @param source the raw content
+     * @return the parsed document, or null if anything went wrong
+     */
+    public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) {
+
+        Parser theParser = null;
        try {
+
+            if ((mimeType != null) && (mimeType.indexOf(";") != -1)) {
+                mimeType = mimeType.substring(0,mimeType.indexOf(";"));
+            }
+
+            // getting the correct parser for the given mimeType
+            theParser = this.getParser(mimeType);
+
+            // if a parser was found we use it ...
+            if (theParser != null) {
+                return theParser.parse(location, mimeType,source);
+            }
+
+            // ...otherwise we make a html scraper and transformer
+            htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
+            OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
+
            hfos.write(source);
            return transformScraper(location, mimeType, scraper);
-        } catch (IOException e) {
+        } catch (Exception e) {
            return null;
+        } finally {
+            if (theParser != null) {
+                try {
+                    // getParser borrows the parser under its class name, so it has to be
+                    // returned under that key and not under the mime type
+                    this.theParserPool.returnObject(theParser.getClass().getName(), theParser);
+                } catch (Exception ignored) {
+                    // returning to the pool is best effort; the parse result is not affected
+                }
+            }
        }
    }

-    public document parseSource(URL location, String mimeType, File sourceFile) {
-        // make a scraper and transformer
-        htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
-        OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
+    /**
+     * Parses content that is stored in a file. If a parser is configured for the
+     * mime type it is borrowed from the pool and used, otherwise the file is
+     * copied through the html scraper.
+     *
+     * @param location the URL the content belongs to
+     * @param mimeType the mime type; parameters after a ';' are cut off
+     * @param sourceFile the file holding the raw content
+     * @return the parsed document, or null if anything went wrong
+     */
+    public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) {
+
+        Parser theParser = null;
        try {
-	    serverFileUtils.copy(sourceFile, hfos);
+            if ((mimeType != null) && (mimeType.indexOf(";") != -1)) {
+                mimeType = mimeType.substring(0,mimeType.indexOf(";"));
+            }
+
+            // getting the correct parser for the given mimeType
+            theParser = this.getParser(mimeType);
+
+            // if a parser was found we use it ...
+            if (theParser != null) {
+                return theParser.parse(location, mimeType,sourceFile);
+            }
+
+            // ...otherwise we make a scraper and transformer
+            htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
+            OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
+
+            serverFileUtils.copy(sourceFile, hfos);
            return transformScraper(location, mimeType, scraper);
-        } catch (IOException e) {
+        } catch (Exception e) {
            return null;
+        } finally {
+            if (theParser != null) {
+                try {
+                    // getParser borrows the parser under its class name, so it has to be
+                    // returned under that key and not under the mime type
+                    this.theParserPool.returnObject(theParser.getClass().getName(), theParser);
+                } catch (Exception ignored) {
+                    // returning to the pool is best effort; the parse result is not affected
+                }
+            }
        }
    }
    
-    public document transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
+    public plasmaParserDocument transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
        try {
-            return new document(new URL(urlNormalform(location)),
+            return new plasmaParserDocument(new URL(urlNormalform(location)),
                                mimeType, null, null, scraper.getHeadline(),
                                null, null,
                                scraper.getText(), scraper.getAnchors(), scraper.getImages());
@ -99,6 +202,41 @@ public class plasmaParser {
        }
    }
    
+    /**
+     * Determines the parser that should be used for a given mimetype and
+     * borrows an instance of it from the parser pool.
+     *
+     * The pool key is the parser class name configured for the mime type. A
+     * parser that does not list the mime type as supported is handed back to
+     * the pool right away.
+     *
+     * @param mimeType the mime type to look up; may be null
+     * @return a borrowed parser that the caller has to return to the pool, or
+     *         null if the mime type is null, no parser is configured for it,
+     *         the parser does not support it, or the parser could not be obtained
+     */
+    public Parser getParser(String mimeType) {
+
+        if (mimeType == null) {
+            // TODO: do automatic mimetype detection
+            return null;
+        }
+
+        String parserClassName = null;
+        Parser borrowedParser = null;
+        try {
+            // the parser list cannot hold null values, so a null result means "not configured"
+            parserClassName = (String) this.parserList.get(mimeType);
+            if (parserClassName == null) {
+                return null;
+            }
+
+            // fetching a new parser object from pool
+            borrowedParser = (Parser) this.theParserPool.borrowObject(parserClassName);
+
+            // checking if the created parser really supports the given mimetype
+            HashSet supportedMimeTypes = borrowedParser.getSupportedMimeTypes();
+            if ((supportedMimeTypes != null) && (supportedMimeTypes.contains(mimeType))) {
+                return borrowedParser;
+            }
+
+            // wrong parser for this type: hand it back; the reference is cleared first
+            // so the catch block below cannot return it a second time
+            Parser rejectedParser = borrowedParser;
+            borrowedParser = null;
+            this.theParserPool.returnObject(parserClassName, rejectedParser);
+        } catch (Exception e) {
+            System.err.println("ERROR: Unable to load the correct parser for type " + mimeType);
+
+            // do not leak a parser that was borrowed before the failure
+            if (borrowedParser != null) {
+                try {
+                    this.theParserPool.returnObject(parserClassName, borrowedParser);
+                } catch (Exception ignored) {
+                    // best effort; the failure itself was already reported above
+                }
+            }
+        }
+
+        return null;
+
+    }
+    
    public static String urlNormalform(URL url) {
        if (url == null) return null;
        return urlNormalform(url.toString());
@ -114,160 +252,7 @@ public class plasmaParser {
        return us;
    }   
    
-    public class document {
-        
-        URL location;       // the source url
-        String mimeType;    // mimeType as taken from http header
-        String keywords;    // most resources provide a keyword field
-        String shortTitle;  // a shortTitle mostly appears in the window header (border)
-        String longTitle;   // the real title of the document, commonly h1-tags
-        String[] sections;  // if present: more titles/headlines appearing in the document
-        String abstrct;     // an abstract, if present: short content description
-        byte[] text;        // the clear text, all that is visible
-        Map anchors;        // all links embedded as clickeable entities (anchor tags)
-        Map images;         // all visible pictures in document
-        // the anchors and images - Maps are URL-to-EntityDescription mappings.
-        // The EntityDescription appear either as visible text in anchors or as alternative
-        // text in image tags.
-        Map hyperlinks;
-        Map medialinks;
-        Map emaillinks;
-                        
-        public document(URL location, String mimeType,
-                        String keywords, String shortTitle, String longTitle,
-                        String[] sections, String abstrct,
-                        byte[] text, Map anchors, Map images) {
-            this.location = location;
-            this.mimeType = mimeType;
-            this.keywords = keywords;
-            this.shortTitle = shortTitle;
-            this.longTitle = longTitle;
-            this.sections = sections;
-            this.abstrct = abstrct;
-            this.text = text;
-            this.anchors = anchors;
-            this.images = images;
-            this.hyperlinks = null;
-            this.medialinks = null;
-            this.emaillinks = null;
-        }
-        
-        private String absolutePath(String relativePath) {
-            try {
-                return urlNormalform(new URL(location, relativePath));
-            } catch (Exception e) {
-                return "";
-            }
-        }
-        
-        public String getMainShortTitle() {
-            if (shortTitle != null) return shortTitle; else return longTitle;
-        }
-        
-        public String getMainLongTitle() {
-            if (longTitle != null) return longTitle; else return shortTitle;
-        }
-        
-        public String[] getSectionTitles() {
-            if (sections != null) return sections; else return new String[]{getMainLongTitle()};
-        }
-
-        public String getAbstract() {
-            if (abstrct != null) return abstrct; else return getMainLongTitle();
-        }
-        
-        public byte[] getText() {
-            // returns only the clear (visible) text (not the source data)
-            return text;
-        }
-        
-        public Map getAnchors() {
-            // returns all links embedded as anchors (clickeable entities)
-            return anchors;
-        }
-        
-        public Map getImages() {
-            // returns all links enbedded as pictures (visible iin document)
-            return images;
-        }
-        
-        // the next three methods provide a calculated view on the getAnchors/getImages:
-        
-        public Map getHyperlinks() {
-            // this is a subset of the getAnchor-set: only links to other hyperrefs
-            if (hyperlinks == null) resortLinks();
-            return hyperlinks;
-        }
-        
-        public Map getMedialinks() {
-            // this is partly subset of getAnchor and getImage: all non-hyperrefs
-            if (medialinks == null) resortLinks();
-            return medialinks;
-        }
-        
-        public Map getEmaillinks() {
-            // this is part of the getAnchor-set: only links to email addresses
-            if (emaillinks == null) resortLinks();
-            return emaillinks;
-        }
-        
-        private synchronized void resortLinks() {
-            Iterator i;
-            String url;
-            int extpos;
-            String ext;
-            i = anchors.entrySet().iterator();
-            hyperlinks = new HashMap();
-            medialinks = new HashMap();
-            emaillinks = new HashMap();
-            Map.Entry entry;
-            while (i.hasNext()) {
-                entry = (Map.Entry) i.next();
-                url = (String) entry.getKey();
-                if ((url != null) && (url.startsWith("mailto:"))) {
-                    emaillinks.put(url.substring(7), entry.getValue());
-                } else {
-                    extpos = url.lastIndexOf(".");
-                    String normal;
-                    if (extpos > 0) {
-                        ext = url.substring(extpos).toLowerCase();
-                        normal = urlNormalform(url);
-                        if (normal != null) {
-                            if (mediaExt.indexOf(ext.substring(1)) >= 0) {
-                                // this is not an normal anchor, its a media link
-                                medialinks.put(normal, entry.getValue());
-                            } else {
-                                hyperlinks.put(normal, entry.getValue());
-                            }
-                        }
-                    }
-                }
-            }
-            // finally add the images to the medialinks
-            i = images.entrySet().iterator();
-            String normal;
-            while (i.hasNext()) {
-                entry = (Map.Entry) i.next();
-                url = (String) entry.getKey();
-                normal = urlNormalform(url);
-                if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
-            }
-            expandHyperlinks();
-        }
-        
-        
-        public synchronized void expandHyperlinks() {
-            // we add artificial hyperlinks to the hyperlink set that can be calculated from
-            // given hyperlinks and imagelinks
-            hyperlinks.putAll(allReflinks(hyperlinks));
-            hyperlinks.putAll(allReflinks(medialinks));
-            hyperlinks.putAll(allSubpaths(hyperlinks));
-            hyperlinks.putAll(allSubpaths(medialinks));
-        }
-        
-    }
-    
-    private static Map allReflinks(Map links) {
+    static Map allReflinks(Map links) {
        // we find all links that are part of a reference inside a url
        HashMap v = new HashMap();
        Iterator i = links.keySet().iterator();
@ -293,7 +278,7 @@ public class plasmaParser {
        return v;
    }
    
-    private static Map allSubpaths(Map links) {
+    static Map allSubpaths(Map links) {
        HashMap v = new HashMap();
        Iterator i = links.keySet().iterator();
        String s;
@ -312,4 +297,93 @@ public class plasmaParser {
        return v;
    }
    
+    /**
+     * Manual smoke test: creates a parser from the "yacy.parser" configuration file,
+     * reads a hard-coded local pdf file completely into memory and passes the bytes
+     * to parseSource with the mime type "application/pdf".
+     * Any exception is printed to stderr; the command line arguments are ignored.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        FileInputStream theInput = null;
+        try {
+            plasmaParser theParser = new plasmaParser(new File("yacy.parser"));
+            theInput = new FileInputStream(new File("Y:/public_html/test.pdf"));
+            ByteArrayOutputStream theOutput = new ByteArrayOutputStream();
+            serverFileUtils.copy(theInput, theOutput);
+            
+            theParser.parseSource(new URL("http://brain"),"application/pdf",theOutput.toByteArray());
+        } catch (Exception e) {
+            e.printStackTrace();
+        } finally {
+            // release the file handle even if copying or parsing failed;
+            // closing a stream that is already closed has no effect
+            if (theInput != null) {
+                try {
+                    theInput.close();
+                } catch (Exception ignored) {
+                    // nothing sensible can be done if closing fails in this test driver
+                }
+            }
+        }
+    }
+    
 }
+
+/**
+ * Keyed object factory that creates parser instances for a keyed object pool.
+ * The pool key is the fully qualified class name of the class to instantiate.
+ */
+final class plasmaParserFactory implements KeyedPoolableObjectFactory {
+    
+    public plasmaParserFactory() {
+        super();  
+    }
+    
+    /**
+     * Creates a new object by loading the class named by the key and calling
+     * its public no-argument constructor.
+     *
+     * @param key the fully qualified class name; must be a String
+     * @return a new instance of the named class
+     * @throws IllegalArgumentException if the key is not a String (this includes null)
+     * @throws Exception if the class cannot be loaded or instantiated
+     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#makeObject(java.lang.Object)
+     */
+    public Object makeObject(Object key) throws Exception {
+        
+        if (!(key instanceof String))
+            throw new IllegalArgumentException("The object key must be of type string.");
+        
+        Class moduleClass = Class.forName((String)key);
+        return moduleClass.newInstance();
+    }          
+    
+    /**
+     * Called when an object is dropped from the pool. Nothing is released here.
+     *
+     * @param key the key the object was created for
+     * @param obj the object that is being dropped
+     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#destroyObject(java.lang.Object, java.lang.Object)
+     */
+    public void destroyObject(Object key, Object obj) {
+        // intentionally empty: no cleanup is performed for pooled objects
+    }
+    
+    /**
+     * Every object is reported as valid; no check is performed.
+     *
+     * @param key the key the object was created for
+     * @param obj the object to validate
+     * @return always true
+     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#validateObject(java.lang.Object, java.lang.Object)
+     */
+    public boolean validateObject(Object key, Object obj) {
+        return true;
+    }
+    
+    /**
+     * Called before an object is handed out by the pool. Nothing needs to be done.
+     *
+     * @param key the key the object was created for
+     * @param obj the object that is about to be borrowed
+     */
+    public void activateObject(Object key, Object obj)  {
+        //log.debug(" activateObject...");
+    }
+
+    /**
+     * Called when an object is returned to the pool. Parser instances are reset
+     * so that they can be reused; other objects are left untouched.
+     *
+     * @param key the key the object was created for
+     * @param obj the object that was returned
+     */
+    public void passivateObject(Object key, Object obj) { 
+        //log.debug(" passivateObject..." + obj);
+        if (obj instanceof Parser)  {
+            ((Parser) obj).reset();
+        }
+    }
+}    
+
+/**
+ * Keyed object pool for parser instances. This subclass adds no behaviour of
+ * its own; borrowing and returning are forwarded unchanged to the superclass.
+ */
+final class plasmaParserPool extends GenericKeyedObjectPool {
+
+    /**
+     * @param objFactory the factory that creates and recycles the pooled objects
+     * @param config     the pool configuration handed on to the superclass
+     */
+    public plasmaParserPool(plasmaParserFactory objFactory,
+                            GenericKeyedObjectPool.Config config) {
+        super(objFactory, config);
+    }
+
+    /**
+     * @param key the pool key of the requested object
+     * @return the object handed out by the superclass for this key
+     */
+    public Object borrowObject(Object key) throws Exception {
+        final Object pooled = super.borrowObject(key);
+        return pooled;
+    }
+
+    /**
+     * @param key      the pool key the object was borrowed with
+     * @param borrowed the object to give back
+     */
+    public void returnObject(Object key, Object borrowed) throws Exception {
+        super.returnObject(key, borrowed);
+    }
+}   
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@ -0,0 +1,192 @@
+/*
+ * Check4Update is a stand-alone server application that can be used to 
+ * monitor various types of online resources for updates and changes and
+ * notifies the user if a modification was detected.
+ * 
+ * Copyright (C) 2005 Martin Thelian
+ * 
+ * This program is free software; you can redistribute it and/or modify 
+ * it under the terms of the GNU General Public License as published by 
+ * the Free Software Foundation; either version 2 of the License, or (at 
+ * your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful, but 
+ * WITHOUT ANY WARRANTY; without even the implied warranty of 
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License 
+ * along with this program; if not, write to the Free Software Foundation, 
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * 
+ * For more information, please email thelian@users.sourceforge.net
+ * 
+ */ 
+
+/* =======================================================================
+ * Revision Control Information
+ * $Source: $
+ * $Author: $
+ * $Date: $
+ * $Revision: $
+ * ======================================================================= */
+
+package de.anomic.plasma;
+
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * Holds the result of parsing a single document: its meta data, the clear text
+ * and the links found in it. The hyperlink, media link and email link views are
+ * computed lazily from the anchor and image maps on first access.
+ */
+public class plasmaParserDocument {
+    
+    URL location;       // the source url
+    String mimeType;    // mimeType as taken from http header
+    String keywords;    // most resources provide a keyword field
+    String shortTitle;  // a shortTitle mostly appears in the window header (border)
+    String longTitle;   // the real title of the document, commonly h1-tags
+    String[] sections;  // if present: more titles/headlines appearing in the document
+    String abstrct;     // an abstract, if present: short content description
+    byte[] text;        // the clear text, all that is visible
+    Map anchors;        // all links embedded as clickable entities (anchor tags)
+    Map images;         // all visible pictures in document
+    // the anchors and images - Maps are URL-to-EntityDescription mappings.
+    // The EntityDescription appear either as visible text in anchors or as alternative
+    // text in image tags.
+    Map hyperlinks;     // lazily computed, null until resortLinks() has run
+    Map medialinks;     // lazily computed, null until resortLinks() has run
+    Map emaillinks;     // lazily computed, null until resortLinks() has run
+                    
+    /**
+     * @param location   the source url of the document
+     * @param mimeType   the mime type of the document
+     * @param keywords   the keywords of the document, may be null
+     * @param shortTitle the short title, may be null
+     * @param longTitle  the long title, may be null
+     * @param sections   further headlines, may be null
+     * @param abstrct    a short content description, may be null
+     * @param text       the clear text of the document
+     * @param anchors    url-to-description map of all anchors; null is replaced by an empty map
+     * @param images     url-to-description map of all images; null is replaced by an empty map
+     */
+    public plasmaParserDocument(URL location, String mimeType,
+                    String keywords, String shortTitle, String longTitle,
+                    String[] sections, String abstrct,
+                    byte[] text, Map anchors, Map images) {
+        this.location = location;
+        this.mimeType = mimeType;
+        this.keywords = keywords;
+        this.shortTitle = shortTitle;
+        this.longTitle = longTitle;
+        this.sections = sections;
+        this.abstrct = abstrct;
+        this.text = text;
+        this.anchors = (anchors==null)?new HashMap():anchors;
+        this.images = (images==null)?new HashMap():images;
+        this.hyperlinks = null;
+        this.medialinks = null;
+        this.emaillinks = null;
+    }
+    
+    // resolves a relative path against the document location;
+    // returns an empty string if the url cannot be built or normalized
+    private String absolutePath(String relativePath) {
+        try {
+            return plasmaParser.urlNormalform(new URL(location, relativePath));
+        } catch (Exception e) {
+            return "";
+        }
+    }
+    
+    /** @return the short title, or the long title if no short title is set */
+    public String getMainShortTitle() {
+        if (shortTitle != null) return shortTitle; else return longTitle;
+    }
+    
+    /** @return the long title, or the short title if no long title is set */
+    public String getMainLongTitle() {
+        if (longTitle != null) return longTitle; else return shortTitle;
+    }
+    
+    /** @return the section titles, or a one-element array with the main long title if there are none */
+    public String[] getSectionTitles() {
+        if (sections != null) return sections; else return new String[]{getMainLongTitle()};
+    }
+
+    /** @return the abstract, or the main long title if no abstract is set */
+    public String getAbstract() {
+        if (abstrct != null) return abstrct; else return getMainLongTitle();
+    }
+    
+    public byte[] getText() {
+        // returns only the clear (visible) text (not the source data)
+        return text;
+    }
+    
+    public Map getAnchors() {
+        // returns all links embedded as anchors (clickable entities)
+        return anchors;
+    }
+    
+    public Map getImages() {
+        // returns all links embedded as pictures (visible in document)
+        return images;
+    }
+    
+    // the next three methods provide a calculated view on the getAnchors/getImages.
+    // They are synchronized so that the null check and the lazy computation happen
+    // under the same lock; otherwise a second thread could be handed a map that
+    // resortLinks() has created but not yet filled.
+    
+    public synchronized Map getHyperlinks() {
+        // this is a subset of the getAnchor-set: only links to other hyperrefs
+        if (hyperlinks == null) resortLinks();
+        return hyperlinks;
+    }
+    
+    public synchronized Map getMedialinks() {
+        // this is partly subset of getAnchor and getImage: all non-hyperrefs
+        if (medialinks == null) resortLinks();
+        return medialinks;
+    }
+    
+    public synchronized Map getEmaillinks() {
+        // this is part of the getAnchor-set: only links to email addresses
+        if (emaillinks == null) resortLinks();
+        return emaillinks;
+    }
+    
+    /**
+     * Sorts the anchors into email links, media links and hyperlinks, adds all
+     * images to the media links and finally expands the hyperlinks.
+     */
+    private synchronized void resortLinks() {
+        Iterator i;
+        String url;
+        int extpos;
+        String ext;
+        String normal;
+        i = anchors.entrySet().iterator();
+        hyperlinks = new HashMap();
+        medialinks = new HashMap();
+        emaillinks = new HashMap();
+        Map.Entry entry;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            url = (String) entry.getKey();
+            // a null key cannot be classified; skipping it avoids a NullPointerException below
+            if (url == null) continue;
+            if (url.startsWith("mailto:")) {
+                // store the address without the "mailto:" prefix
+                emaillinks.put(url.substring(7), entry.getValue());
+                continue;
+            }
+            extpos = url.lastIndexOf(".");
+            // NOTE(review): anchors without a '.' after the first character are
+            // not added to any of the three maps - confirm that this is intended
+            if (extpos <= 0) continue;
+            ext = url.substring(extpos).toLowerCase();
+            normal = plasmaParser.urlNormalform(url);
+            if (normal == null) continue;
+            if (plasmaParser.mediaExt.indexOf(ext.substring(1)) >= 0) {
+                // this is not a normal anchor, it is a media link
+                medialinks.put(normal, entry.getValue());
+            } else {
+                hyperlinks.put(normal, entry.getValue());
+            }
+        }
+        // finally add the images to the medialinks
+        i = images.entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            url = (String) entry.getKey();
+            normal = plasmaParser.urlNormalform(url);
+            if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
+        }
+        expandHyperlinks();
+    }
+    
+    
+    /**
+     * Adds artificial hyperlinks to the hyperlink set that can be calculated from
+     * the given hyperlinks and media links (embedded references and sub paths).
+     * If the link maps have not been computed yet they are computed first, which
+     * already includes the expansion.
+     */
+    public synchronized void expandHyperlinks() {
+        if ((hyperlinks == null) || (medialinks == null)) {
+            // resortLinks() creates the maps and calls this method again at its end
+            resortLinks();
+            return;
+        }
+        hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks));
+        hyperlinks.putAll(plasmaParser.allReflinks(medialinks));
+        hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks));
+        hyperlinks.putAll(plasmaParser.allSubpaths(medialinks));
+    }
+    
+}
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -149,7 +149,9 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
    public  kelondroTables         facilityDB;
    public  plasmaParser           parser;
    public  int                    serverJobs;
-    public boolean terminate = false;
+    
+    private serverSemaphore shutdownSync = new serverSemaphore(0);
+    private boolean terminate = false;
    
    public plasmaSwitchboard(String rootPath, String initPath, String configPath) throws IOException {
 	super(rootPath, initPath, configPath);
@ -207,7 +209,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
        initProfiles();
        
        // make parser
-        parser = new plasmaParser(new File(""));
+        parser = new plasmaParser(new File("yacy.parser"));
        
        // start indexing management
        loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL);
@ -502,7 +504,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
 	    log.logDebug(stats + " processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", source=" + ((entry.cacheArray == null) ? "scraper" : "byte[]") + ", url=" + entry.nomalizedURLString); // DEBUG

            // parse content
-            plasmaParser.document document;
+            plasmaParserDocument document;
            if (entry.scraper != null) {
                log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is pre-parsed by scraper");
                document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
@ -1397,4 +1399,18 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
        if (adminAccountBase64MD5.equals(serverCodings.standardCoder.encodeMD5Hex(authorization))) return 4; // hard-authenticated, all ok
        return 0; // wrong password
    }
+    
+    /**
+     * Marks the switchboard as terminated and then signals the shutdown
+     * semaphore once.
+     */
+    public void terminate() {
+        // set the flag first so that it is already true when the semaphore is signalled
+        terminate = true;
+        shutdownSync.V();
+    }
+    
+    /**
+     * @return the current value of the terminate flag
+     */
+    public boolean isTerminated() {
+        // NOTE(review): the flag is read without synchronization and the field is
+        // not declared volatile - confirm that callers on other threads can tolerate
+        // seeing a stale value
+        final boolean terminated = terminate;
+        return terminated;
+    }
+    
+    /**
+     * Blocks on the shutdown semaphore and afterwards reports the terminate flag.
+     *
+     * @return the value of the terminate flag after the semaphore was passed
+     * @throws InterruptedException if the semaphore's P() operation throws it
+     */
+    public boolean waitForShutdown() throws InterruptedException {
+        shutdownSync.P();
+        final boolean terminated = terminate;
+        return terminated;
+    }
 }
--- a/source/de/anomic/server/serverSemaphore.java
+++ b/source/de/anomic/server/serverSemaphore.java
@ -0,0 +1,104 @@
+//serverSemaphore.java 
+//------------------------
+//part of YaCy
+//(C) by Michael Peter Christen; mc@anomic.de
+//first published on http://www.anomic.de
+//Frankfurt, Germany, 2005
+//
+//this file is contributed by Martin Thelian
+//last major change: 24.04.2005
+//
+//This program is free software; you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation; either version 2 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program; if not, write to the Free Software
+//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+//Using this software in any meaning (reading, learning, copying, compiling,
+//running) means that you agree that the Author(s) is (are) not responsible
+//for cost, loss of data or any harm that may be caused directly or indirectly
+//by usage of this softare or this documentation. The usage of this software
+//is on your own risk. The installation and usage (starting/running) of this
+//software may allow other people or application to access your computer and
+//any attached devices and is highly dependent on the configuration of the
+//software which must be done by the user of the software; the author(s) is
+//(are) also not responsible for proper configuration and usage of the
+//software, even if provoked by documentation provided together with
+//the software.
+//
+//Any changes to this file according to the GPL as documented in the file
+//gpl.txt aside this file in the shipment you received can be done to the
+//lines that follows this copyright notice here, but changes must not be
+//done inside the copyright notive above. A re-distribution must contain
+//the intact and unchanged copyright notice.
+//Contributions and changes to the program code must be marked as such.
+
+package de.anomic.server;
+
+/**
+ * A counting semaphore built on the object monitor.
+ * P() takes one permit and blocks while none is available,
+ * V() gives one permit back and wakes up a waiting thread.
+ * The number of available permits never exceeds the configured maximum.
+ */
+public final class serverSemaphore  {
+    // number of permits that are currently available; never negative
+    private long currentValue = 0;    
+    // upper bound for currentValue
+    private long maximumValue = Long.MAX_VALUE;
+     
+    /** Creates a semaphore without permits and with a maximum of Long.MAX_VALUE. */
+    protected serverSemaphore()  {
+        this(0,Long.MAX_VALUE);
+    }
+    
+    /**
+     * Creates a semaphore with a maximum of Long.MAX_VALUE.
+     * @param initialValue the number of initially available permits, must be 0 or greater
+     */
+    public serverSemaphore(long initialValue)  {
+        this(initialValue,Long.MAX_VALUE);
+    }    
+
+    /**
+     * @param initialValue the number of initially available permits, must be 0 or greater
+     * @param maxValue     the largest number of permits that may be available at any time,
+     *                     must be 1 or greater and not less than initialValue
+     * @throws IllegalArgumentException if one of the conditions above is violated
+     */
+    protected serverSemaphore(long initialValue, long maxValue) {
+        /* some errorhandling */
+        if (maxValue < initialValue) {
+            // the message used to say "greater", which is the opposite of the condition checked here
+            throw new IllegalArgumentException("The semaphore maximum value must not be " +
+                                               "less than the semaphore init value.");
+        }
+        
+        if (maxValue < 1)  {
+             throw new IllegalArgumentException("The semaphore maximum value must be greater or equal 1.");          
+        }
+        
+        if (initialValue < 0) {
+             throw new IllegalArgumentException("The semaphore initial value must be greater or equal 0.");          
+        }
+        
+        
+        // setting the initial Semaphore Values
+        this.currentValue = initialValue;
+        this.maximumValue = maxValue;        
+    }
+    
+    /**
+     * Takes one permit and blocks until one is available.
+     *
+     * @throws InterruptedException if the calling thread is interrupted while waiting;
+     *         no permit is taken in this case
+     */
+    public synchronized void P() throws InterruptedException
+    {   
+         // wait() may return without a matching notify (spurious wakeup), and another
+         // thread may take the permit before a notified thread gets the monitor back,
+         // therefore the condition has to be re-checked in a loop
+         while (this.currentValue <= 0) {
+             wait();
+         }
+         this.currentValue--;
+    }
+
+    /**
+     * Gives one permit back and wakes up one waiting thread.
+     *
+     * @throws IndexOutOfBoundsException if the maximum number of permits is already available
+     */
+    public synchronized void V() {
+        // compare without adding 1: this cannot overflow and allows the maximum itself to be reached
+        if (this.currentValue >= this.maximumValue) {
+              throw new IndexOutOfBoundsException("The maximum value of the semaphore was reached");
+        }        
+        
+        this.currentValue++;        
+         
+        // a notify without waiting threads has no effect, so no bookkeeping of waiters is needed
+        notify();
+     }
+ }
--- a/source/yacy.java
+++ b/source/yacy.java
@ -79,7 +79,7 @@ public final class yacy {

    // static objects
    private static final String vString = "@REPL_VERSION@";
-    private static final String vDATE   = "@REPL_DATE@";
+    private static final String vDATE   = "20050422";
    private static final String copyright = "[ YACY Proxy v" + vString + ", build " + vDATE + " by Michael Christen / www.yacy.net ]";
    private static final String hline = "-------------------------------------------------------------------------------";
    
@ -237,10 +237,9 @@ public final class yacy {
 			serverSystem.openBrowser("http://localhost:" + port + "/" + browserPopUpPage, browserPopUpApplication);
 		    }

-		    // loop and wait
-		    while (!(sb.terminate)) try {
-			Thread.currentThread().sleep(1000); // wait a while
-                        // System.gc(); // prevent that we catch too much memory
+            // wait for server shutdown
+			try {
+                sb.waitForShutdown();
 		    } catch (Exception e) {
                        serverLog.logError("MAIN CONTROL LOOP", "PANIK: " + e.getMessage());
                        e.printStackTrace();
@ -259,7 +258,7 @@ public final class yacy {
                    
                    // idle until the processes are down
                    while (server.isAlive()) {
-			Thread.currentThread().sleep(2000); // wait a while
+						Thread.currentThread().sleep(2000); // wait a while
                    }
                    serverLog.logSystem("SHUTDOWN", "server has terminated");
                    sb.close();
--- a/yacy.parser
+++ b/yacy.parser
@ -0,0 +1 @@
+application/pdf=de.anomic.plasma.parser.pdf.pdfParser
				`@ -0,0 +1 @@`
				`application/pdf=de.anomic.plasma.parser.pdf.pdfParser`