*) adding a new package for extra content parsers

*) adding content parser for
- pdf (using the pdf-box library)
- doc (using the textmining.org library)
*) adding an interface for content parsers
*) adding a configuration file which can be used to configure which parser is used for which mimeType
*) Semaphore class was moved and renamed to serverSemaphore
*) Changing yacy shutdown behaviour
The busy-waiting loop for shutdown was removed and replaced with a blocking call (using the semaphore class mentioned above) to the new switchboard.waitForShutdown method.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@46 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent 17d993cfee
commit 58b1a0ba40

@ -5,6 +5,7 @@
<property name="src" location="source"/>
<property name="lib" location="lib"/>
<property name="libx" location="libx"/>
<property name="build" location="classes"/>
<property name="htroot" location="htroot"/>
@ -39,6 +40,13 @@
<!-- libs needed for the yacy thread/object-pools -->
<pathelement location="${lib}/commons-collections.jar" />
<pathelement location="${lib}/commons-pool-1.2.jar" />
<!-- libs needed to parse pdf files -->
<pathelement location="${libx}/PDFBox-0.7.1.jar" />
<pathelement location="${libx}/log4j-1.2.9.jar" />
<!-- libs needed for parsing doc files -->
<pathelement location="${libx}/tm-extractors-0.4.jar" />
</classpath>
</javac>
<javac srcdir="${htroot}/" destdir="${htroot}" classpath="${build}"/>

@ -114,7 +114,7 @@ public class CacheAdmin_p {
else {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(url);
OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
plasmaParser.document document = switchboard.parser.transformScraper(url, "text/html", scraper);
plasmaParserDocument document = switchboard.parser.transformScraper(url, "text/html", scraper);
serverFileUtils.copy(file, os);
info += "<b>HEADLINE:</b><br>" + scraper.getHeadline() + "<br><br>";
info += "<b>HREF:</b><br>" + formatAnchor(document.getHyperlinks()) + "<br>";

@ -73,7 +73,7 @@ public class Steering {
}
if (post.containsKey("shutdown")) {
switchboard.terminate = true;
switchboard.terminate();
prop.put("info", 3);//shutting down
return prop;
}

@ -0,0 +1,59 @@
/*
* Check4Update is a stand-alone server application that can be used to
* monitor various types of online resources for updates and changes and
* notifies the user if a modification was detected.
*
* Copyright (C) 2005 Martin Thelian
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* For more information, please email thelian@users.sourceforge.net
*
*/
/* =======================================================================
* Revision Control Information
* $Source: $
* $Author: $
* $Date: $
* $Revision: $
* ======================================================================= */
package de.anomic.plasma.parser;
import java.io.File;
import java.io.InputStream;
import java.net.URL;
import java.util.HashSet;
import de.anomic.plasma.plasmaParserDocument;
/**
 * Contract implemented by every content parser that turns a resource of a
 * specific mime type into a {@link plasmaParserDocument}.
 *
 * The same content can be handed over as a byte array, as a file or as a
 * stream; all three variants report failures as a {@link ParserException}.
 */
public interface Parser {

    /**
     * Parses content that is completely available in memory.
     *
     * @param location the URL the content belongs to
     * @param mimeType the mime type of the content
     * @param source   the raw content
     * @return the parsed document
     * @throws ParserException if the content could not be parsed
     */
    plasmaParserDocument parse(URL location, String mimeType, byte[] source)
            throws ParserException;

    /**
     * Parses content that is stored in a file.
     *
     * @param location   the URL the content belongs to
     * @param mimeType   the mime type of the content
     * @param sourceFile the file holding the raw content
     * @return the parsed document
     * @throws ParserException if the content could not be parsed
     */
    plasmaParserDocument parse(URL location, String mimeType, File sourceFile)
            throws ParserException;

    /**
     * Parses content that is read from a stream.
     *
     * @param location the URL the content belongs to
     * @param mimeType the mime type of the content
     * @param source   the stream delivering the raw content
     * @return the parsed document
     * @throws ParserException if the content could not be parsed
     */
    plasmaParserDocument parse(URL location, String mimeType, InputStream source)
            throws ParserException;

    /**
     * @return the set of mime type names this parser is able to handle
     */
    HashSet getSupportedMimeTypes();

    /**
     * Resets the parser so that the instance can be used for another document.
     */
    void reset();
}

@ -0,0 +1,21 @@
package de.anomic.plasma.parser;
/**
 * Checked exception signalling that a content parser was unable to process
 * the content it was given.
 *
 * The four constructors mirror those of {@link Exception}.
 */
public class ParserException extends Exception {

    /** Creates an exception without detail message and without cause. */
    public ParserException() {
        super();
    }

    /**
     * @param detail human readable description of the failure
     */
    public ParserException(String detail) {
        super(detail);
    }

    /**
     * @param detail    human readable description of the failure
     * @param rootCause the underlying problem that made parsing fail
     */
    public ParserException(String detail, Throwable rootCause) {
        super(detail, rootCause);
    }

    /**
     * @param rootCause the underlying problem that made parsing fail
     */
    public ParserException(Throwable rootCause) {
        super(rootCause);
    }
}

@ -0,0 +1,127 @@
/*
* Check4Update is a stand-alone server application that can be used to
* monitor various types of online resources for updates and changes and
* notifies the user if a modification was detected.
*
* Copyright (C) 2005 Martin Thelian
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* For more information, please email thelian@users.sourceforge.net
*
*/
/* =======================================================================
* Revision Control Information
* $Source: $
* $Author: $
* $Date: $
* $Revision: $
* ======================================================================= */
package de.anomic.plasma.parser.doc;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import org.textmining.text.extraction.WordExtractor;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
/**
 * Parser for Microsoft Word documents. The text is extracted with the
 * textmining.org {@link WordExtractor}; no metadata, anchors or images are
 * extracted, only the plain text.
 */
public class docParser implements Parser {

    /**
     * a list of mime types that are supported by this parser class
     */
    public static final HashSet<String> SUPPORTED_MIME_TYPES = new HashSet<String>(Arrays.asList(new String[] {
        "application/msword"
    }));

    public docParser() {
        super();
    }

    /**
     * Parses a Word document that is held completely in memory.
     *
     * @param location the URL of the document
     * @param mimeType the mime type of the document
     * @param source   the raw document bytes
     * @return the parsed document
     * @throws ParserException if the text could not be extracted
     */
    public plasmaParserDocument parse(URL location, String mimeType,
            byte[] source) throws ParserException {
        return this.parse(location, mimeType, new ByteArrayInputStream(source));
    }

    /**
     * Parses a Word document stored in a file. The file stream opened here
     * is always closed again before this method returns.
     *
     * @param location   the URL of the document
     * @param mimeType   the mime type of the document
     * @param sourceFile the file containing the document
     * @return the parsed document
     * @throws ParserException if the file cannot be opened or the text could
     *         not be extracted
     */
    public plasmaParserDocument parse(URL location, String mimeType,
            File sourceFile) throws ParserException {
        InputStream contentInputStream = null;
        try {
            contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
            return this.parse(location, mimeType, contentInputStream);
        } catch (FileNotFoundException e) {
            // report the real problem instead of continuing with a null stream
            throw new ParserException("Unable to open the doc file '" + sourceFile + "'. " + e.getMessage(), e);
        } finally {
            if (contentInputStream != null) {
                try {
                    contentInputStream.close();
                } catch (java.io.IOException ignored) {
                    // the parse result (or the parse failure) is already
                    // determined; a failing close cannot change it
                }
            }
        }
    }

    /**
     * Extracts the text of a Word document read from the given stream. The
     * stream is not closed by this method.
     *
     * @param location the URL of the document
     * @param mimeType the mime type of the document
     * @param source   the stream delivering the document
     * @return a document that carries only the extracted text
     * @throws ParserException if the extraction fails for any reason; the
     *         original exception is attached as cause
     */
    public plasmaParserDocument parse(URL location, String mimeType,
            InputStream source) throws ParserException {
        try {
            WordExtractor extractor = new WordExtractor();
            String contents = extractor.extractText(source);

            // NOTE(review): getBytes() uses the platform default charset --
            // confirm which encoding the consumers of getText() expect
            return new plasmaParserDocument(
                    location,
                    mimeType,
                    null,
                    null,
                    null,
                    null,
                    null,
                    contents.getBytes(),
                    null,
                    null);
        } catch (Exception e) {
            throw new ParserException("Unable to parse the doc content. " + e.getMessage(), e);
        }
    }

    /**
     * @return the shared set {@link #SUPPORTED_MIME_TYPES}
     */
    public HashSet getSupportedMimeTypes() {
        return docParser.SUPPORTED_MIME_TYPES;
    }

    /**
     * Nothing to reset: this parser keeps no state between two parse calls.
     */
    public void reset() {
        // intentionally empty
    }

    /**
     * Placeholder entry point; does nothing.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // intentionally empty
    }
}

@ -0,0 +1,119 @@
package de.anomic.plasma.parser.pdf;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
/**
 * Parser for PDF documents based on the PDFBox library. Besides the plain
 * text it extracts title, subject and keywords from the document
 * information, if present.
 */
public class pdfParser implements Parser {

    /**
     * a list of mime types that are supported by this parser class
     */
    public static final HashSet<String> SUPPORTED_MIME_TYPES = new HashSet<String>(Arrays.asList(new String[] {
        "application/pdf"
    }));

    public pdfParser() {
        super();
    }

    /**
     * @return the shared set {@link #SUPPORTED_MIME_TYPES}
     */
    public HashSet getSupportedMimeTypes() {
        return SUPPORTED_MIME_TYPES;
    }

    /**
     * Parses a PDF document stored in a file. The file stream opened here is
     * always closed again before this method returns.
     *
     * @param location   the URL of the document
     * @param mimeType   the mime type of the document
     * @param sourceFile the file containing the document
     * @return the parsed document
     * @throws ParserException if the file cannot be opened or the content
     *         could not be parsed
     */
    public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException {
        InputStream contentInputStream = null;
        try {
            contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));
            return this.parse(location, mimeType, contentInputStream);
        } catch (FileNotFoundException e) {
            // report the real problem instead of continuing with a null stream
            throw new ParserException("Unable to open the pdf file '" + sourceFile + "'. " + e.getMessage(), e);
        } finally {
            if (contentInputStream != null) {
                try {
                    contentInputStream.close();
                } catch (java.io.IOException ignored) {
                    // the parse result (or the parse failure) is already
                    // determined; a failing close cannot change it
                }
            }
        }
    }

    /**
     * Parses a PDF document that is held completely in memory.
     *
     * @param location the URL of the document
     * @param mimeType the mime type of the document
     * @param source   the raw document bytes
     * @return the parsed document
     * @throws ParserException if the content could not be parsed
     */
    public plasmaParserDocument parse(URL location, String mimeType, byte[] source) throws ParserException {
        return this.parse(location, mimeType, new ByteArrayInputStream(source));
    }

    /**
     * Parses a PDF document read from the given stream. The stream itself is
     * not closed by this method; the PDFBox document created from it is
     * closed in every case, also when the extraction fails.
     *
     * @param location the URL of the document
     * @param mimeType the mime type of the document
     * @param source   the stream delivering the document
     * @return the parsed document
     * @throws ParserException if parsing fails for any reason; the original
     *         exception is attached as cause
     */
    public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException {
        PDDocument theDocument = null;
        try {
            String docTitle = null, docSubject = null, docKeyWords = null;

            PDFParser parser = new PDFParser(source);
            parser.parse();

            PDFTextStripper stripper = new PDFTextStripper();
            theDocument = parser.getPDDocument();

            PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
            if (theDocInfo != null) {
                docTitle = theDocInfo.getTitle();
                docSubject = theDocInfo.getSubject();
                docKeyWords = theDocInfo.getKeywords();
            }

            // NOTE(review): the writer uses the platform default charset --
            // confirm which encoding the consumers of getText() expect
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            OutputStreamWriter writer = new OutputStreamWriter(out);
            stripper.writeText(theDocument, writer);
            writer.close();

            byte[] contents = out.toByteArray();

            /*
             * argument order of the plasmaParserDocument constructor:
             *   location, mimeType,
             *   keywords, shortTitle, longTitle,
             *   sections, abstrct,
             *   text, anchors, images
             */
            return new plasmaParserDocument(
                    location,
                    mimeType,
                    docKeyWords,
                    docSubject,
                    docTitle,
                    null,
                    null,
                    contents,
                    null,
                    null);
        } catch (Exception e) {
            throw new ParserException("Unable to parse the pdf content. " + e.getMessage(), e);
        } finally {
            if (theDocument != null) {
                try {
                    theDocument.close();
                } catch (Exception ignored) {
                    // best effort: the text is already extracted (or the
                    // failure already reported) at this point
                }
            }
        }
    }

    /**
     * Nothing to reset: this parser keeps no state between two parse calls.
     */
    public void reset() {
        // intentionally empty
    }
}

@ -192,75 +192,14 @@ public final class plasmaCrawlLoader extends Thread {
}
/**
 * A counting semaphore built on the monitor of the instance.
 *
 * {@link #P()} takes one permit and blocks while none is available,
 * {@link #V()} gives one permit back and wakes up a waiting thread. The
 * number of available permits never exceeds the configured maximum.
 */
final class Semaphore {
    /** number of permits that are currently available, never negative */
    private long currentValue = 0;
    /** upper bound for {@link #currentValue} */
    private long maximumValue = Long.MAX_VALUE;

    /** Creates a semaphore without permits and without practical upper bound. */
    protected Semaphore() {
        this(0, Long.MAX_VALUE);
    }

    /**
     * Creates a semaphore without practical upper bound.
     *
     * @param initialValue the number of initially available permits
     */
    public Semaphore(long initialValue) {
        this(initialValue, Long.MAX_VALUE);
    }

    /**
     * @param initialValue the number of initially available permits, must be >= 0
     * @param maxValue     the maximum number of permits, must be >= 1 and
     *                     must not be less than <code>initialValue</code>
     * @throws IllegalArgumentException if one of the constraints is violated
     */
    protected Semaphore(long initialValue, long maxValue) {
        /* some errorhandling */
        if (maxValue < initialValue) {
            throw new IllegalArgumentException("The semaphore maximum value must not be " +
                                               "less than the semaphore initial value.");
        }
        if (maxValue < 1) {
            throw new IllegalArgumentException("The semaphore maximum value must be greater or equal 1.");
        }
        if (initialValue < 0) {
            throw new IllegalArgumentException("The semaphore initial value must be greater or equal 0.");
        }

        // setting the initial semaphore values
        this.currentValue = initialValue;
        this.maximumValue = maxValue;
    }

    /**
     * Takes one permit, blocking until one is available.
     *
     * @throws InterruptedException if the thread is interrupted while waiting;
     *         no permit is consumed in this case
     */
    public synchronized void P() throws InterruptedException {
        try {
            // re-check in a loop: wait() may return without a matching V()
            // (spurious wakeup) or another thread may have taken the permit
            while (this.currentValue <= 0) {
                wait();
            }
        } catch (InterruptedException e) {
            // a notification may have been delivered to this thread together
            // with the interrupt; pass it on so that it is not lost
            notify();
            throw e;
        }
        this.currentValue--;
    }

    /**
     * Gives one permit back and wakes up one waiting thread.
     *
     * @throws IndexOutOfBoundsException if the semaphore already holds its
     *         maximum number of permits
     */
    public synchronized void V() {
        // compare without "+ 1" so that Long.MAX_VALUE cannot overflow
        if (this.currentValue >= this.maximumValue) {
            throw new IndexOutOfBoundsException("The maximum value of the semaphore was reached");
        }
        this.currentValue++;
        notify();
    }
}
class CrawlerMessageQueue {
private final Semaphore readSync;
private final Semaphore writeSync;
private final serverSemaphore readSync;
private final serverSemaphore writeSync;
private final ArrayList messageList;
public CrawlerMessageQueue() {
this.readSync = new Semaphore (0);
this.writeSync = new Semaphore (1);
this.readSync = new serverSemaphore (0);
this.writeSync = new serverSemaphore (1);
this.messageList = new ArrayList(10);
}

@ -45,52 +45,155 @@ package de.anomic.plasma;
import java.io.*;
import java.net.*;
import java.util.*;
import de.anomic.server.*;
import org.apache.commons.pool.KeyedPoolableObjectFactory;
import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.plasma.parser.Parser;
import de.anomic.server.serverFileUtils;
import de.anomic.htmlFilter.*;
public class plasmaParser {
public final class plasmaParser {
public static String mediaExt =
"swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
"sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj";
"sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj";
private final Properties parserList;
private final plasmaParserPool theParserPool;
/**
 * Creates the parser dispatcher.
 *
 * The given property file maps mime types (keys) to the names of the parser
 * classes (values) that handle them. Entries whose class cannot be loaded are
 * dropped right away, so that such a parser is never requested later on.
 *
 * @param parserDispatcherPropertyFile the mime type to parser class mapping;
 *        if it cannot be read an error is printed and no extra parser is used
 */
public plasmaParser(File parserDispatcherPropertyFile) {
    // loading a list of available parsers from file
    Properties prop = new Properties();
    FileInputStream propertyStream = null;
    try {
        propertyStream = new FileInputStream(parserDispatcherPropertyFile);
        prop.load(propertyStream);
    } catch (IOException e) {
        System.err.println("ERROR: " + parserDispatcherPropertyFile.toString() + " not found in settings path");
    } finally {
        if (propertyStream != null) {
            try {
                propertyStream.close();
            } catch (IOException ignored) {
                // the properties are already loaded (or the failure already
                // reported); a failing close changes nothing
            }
        }
    }
    this.parserList = prop;

    /*
     * initializing the parser object pool
     */
    GenericKeyedObjectPool.Config config = new GenericKeyedObjectPool.Config();

    // The maximum number of active parsers that can be allocated from pool at the same time,
    // 0 for no limit
    config.maxActive = 0;

    // The maximum number of idle parsers in the pool
    // 0 = no limit.
    config.maxIdle = 10;

    config.whenExhaustedAction = GenericObjectPool.WHEN_EXHAUSTED_BLOCK;
    config.minEvictableIdleTimeMillis = 30000;

    this.theParserPool = new plasmaParserPool(new plasmaParserFactory(), config);

    /* testing if all parsers could be loaded properly.
     * This is done now to avoid surprises at runtime. */
    Iterator parserIterator = this.parserList.entrySet().iterator();
    while (parserIterator.hasNext()) {
        Map.Entry parserEntry = (Map.Entry) parserIterator.next();
        String className = (String) parserEntry.getValue();
        try {
            Class.forName(className);
        } catch (Exception e) {
            // if we could not load the parser we remove its mime type mapping;
            // removal goes through the iterator because the list is keyed by
            // mime type (not class name) and must not be modified directly
            // while it is being iterated
            parserIterator.remove();
        } catch (LinkageError e) {
            // the parser class exists but a library it depends on is missing
            parserIterator.remove();
        }
    }
}
public void close() {
// frees resources; does nothing yet
}
// release resources
try {
// clearing the parser list
this.parserList.clear();
// closing the parser object pool
this.theParserPool.close();
} catch (Exception e) {
//
}
}
public document parseSource(URL location, String mimeType, byte[] source) {
// make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
/**
 * Parses content that is held in memory.
 *
 * If a parser is configured for the mime type it is used; all other content
 * is run through the html scraper.
 *
 * @param location the URL of the content
 * @param mimeType the mime type, possibly followed by parameters such as
 *                 "; charset=..." which are ignored
 * @param source   the raw content
 * @return the parsed document or <code>null</code> if parsing failed
 */
public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) {
    Parser theParser = null;
    try {
        if (mimeType != null) {
            // cut off mime type parameters and the whitespace around the type
            int parameterStart = mimeType.indexOf(";");
            if (parameterStart != -1) {
                mimeType = mimeType.substring(0, parameterStart);
            }
            mimeType = mimeType.trim();
        }

        // getting the correct parser for the given mimeType
        theParser = this.getParser(mimeType);

        // if a parser was found we use it ...
        if (theParser != null) {
            return theParser.parse(location, mimeType, source);
        }

        // ...otherwise we make a html scraper and transformer
        htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
        OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);

        hfos.write(source);
        return transformScraper(location, mimeType, scraper);
    } catch (Exception e) {
        return null;
    } finally {
        if (theParser != null) {
            try {
                // the pool is keyed by the parser class name (see getParser and
                // plasmaParserFactory.makeObject), not by the mime type
                this.theParserPool.returnObject(theParser.getClass().getName(), theParser);
            } catch (Exception ignored) {
                // best effort: a parser that cannot be returned is simply lost
            }
        }
    }
}
public document parseSource(URL location, String mimeType, File sourceFile) {
// make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
/**
 * Parses content that is stored in a file.
 *
 * If a parser is configured for the mime type it is used; all other content
 * is run through the html scraper.
 *
 * @param location   the URL of the content
 * @param mimeType   the mime type, possibly followed by parameters such as
 *                   "; charset=..." which are ignored
 * @param sourceFile the file holding the raw content
 * @return the parsed document or <code>null</code> if parsing failed
 */
public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) {
    Parser theParser = null;
    try {
        if (mimeType != null) {
            // cut off mime type parameters and the whitespace around the type
            int parameterStart = mimeType.indexOf(";");
            if (parameterStart != -1) {
                mimeType = mimeType.substring(0, parameterStart);
            }
            mimeType = mimeType.trim();
        }

        // getting the correct parser for the given mimeType
        theParser = this.getParser(mimeType);

        // if a parser was found we use it ...
        if (theParser != null) {
            return theParser.parse(location, mimeType, sourceFile);
        }

        // ...otherwise we make a scraper and transformer
        htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
        OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);

        serverFileUtils.copy(sourceFile, hfos);
        return transformScraper(location, mimeType, scraper);
    } catch (Exception e) {
        return null;
    } finally {
        if (theParser != null) {
            try {
                // the pool is keyed by the parser class name (see getParser and
                // plasmaParserFactory.makeObject), not by the mime type
                this.theParserPool.returnObject(theParser.getClass().getName(), theParser);
            } catch (Exception ignored) {
                // best effort: a parser that cannot be returned is simply lost
            }
        }
    }
}
public document transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
public plasmaParserDocument transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
try {
return new document(new URL(urlNormalform(location)),
return new plasmaParserDocument(new URL(urlNormalform(location)),
mimeType, null, null, scraper.getHeadline(),
null, null,
scraper.getText(), scraper.getAnchors(), scraper.getImages());
@ -99,6 +202,41 @@ public class plasmaParser {
}
}
/**
 * Looks up the parser that is configured for a mime type and borrows an
 * instance of it from the parser pool.
 *
 * A borrowed parser that does not list the mime type among its supported
 * types is handed back to the pool immediately and not returned.
 *
 * @param mimeType the mime type a parser is needed for, may be <code>null</code>
 * @return a borrowed parser, or <code>null</code> if the mime type is
 *         <code>null</code>, no parser is configured for it, the parser does
 *         not support it, or the parser could not be obtained
 */
public Parser getParser(String mimeType) {
    // TODO: do automatic mimetype detection
    if (mimeType == null) return null;

    Parser result = null;
    try {
        if (this.parserList.containsKey(mimeType)) {
            final String poolKey = (String) this.parserList.get(mimeType);

            // fetching a new parser object from pool
            final Parser candidate = (Parser) this.theParserPool.borrowObject(poolKey);

            // checking if the created parser really supports the given mimetype
            final HashSet accepted = candidate.getSupportedMimeTypes();
            final boolean supported = (accepted != null) && accepted.contains(mimeType);
            if (supported) {
                result = candidate;
            } else {
                this.theParserPool.returnObject(poolKey, candidate);
            }
        }
    } catch (Exception e) {
        System.err.println("ERROR: Unable to load the correct parser for type " + mimeType);
    }
    return result;
}
public static String urlNormalform(URL url) {
if (url == null) return null;
return urlNormalform(url.toString());
@ -114,160 +252,7 @@ public class plasmaParser {
return us;
}
public class document {
URL location; // the source url
String mimeType; // mimeType as taken from http header
String keywords; // most resources provide a keyword field
String shortTitle; // a shortTitle mostly appears in the window header (border)
String longTitle; // the real title of the document, commonly h1-tags
String[] sections; // if present: more titles/headlines appearing in the document
String abstrct; // an abstract, if present: short content description
byte[] text; // the clear text, all that is visible
Map anchors; // all links embedded as clickeable entities (anchor tags)
Map images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
Map hyperlinks;
Map medialinks;
Map emaillinks;
public document(URL location, String mimeType,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, Map images) {
this.location = location;
this.mimeType = mimeType;
this.keywords = keywords;
this.shortTitle = shortTitle;
this.longTitle = longTitle;
this.sections = sections;
this.abstrct = abstrct;
this.text = text;
this.anchors = anchors;
this.images = images;
this.hyperlinks = null;
this.medialinks = null;
this.emaillinks = null;
}
private String absolutePath(String relativePath) {
try {
return urlNormalform(new URL(location, relativePath));
} catch (Exception e) {
return "";
}
}
public String getMainShortTitle() {
if (shortTitle != null) return shortTitle; else return longTitle;
}
public String getMainLongTitle() {
if (longTitle != null) return longTitle; else return shortTitle;
}
public String[] getSectionTitles() {
if (sections != null) return sections; else return new String[]{getMainLongTitle()};
}
public String getAbstract() {
if (abstrct != null) return abstrct; else return getMainLongTitle();
}
public byte[] getText() {
// returns only the clear (visible) text (not the source data)
return text;
}
public Map getAnchors() {
// returns all links embedded as anchors (clickeable entities)
return anchors;
}
public Map getImages() {
// returns all links enbedded as pictures (visible iin document)
return images;
}
// the next three methods provide a calculated view on the getAnchors/getImages:
public Map getHyperlinks() {
// this is a subset of the getAnchor-set: only links to other hyperrefs
if (hyperlinks == null) resortLinks();
return hyperlinks;
}
public Map getMedialinks() {
// this is partly subset of getAnchor and getImage: all non-hyperrefs
if (medialinks == null) resortLinks();
return medialinks;
}
public Map getEmaillinks() {
// this is part of the getAnchor-set: only links to email addresses
if (emaillinks == null) resortLinks();
return emaillinks;
}
private synchronized void resortLinks() {
Iterator i;
String url;
int extpos;
String ext;
i = anchors.entrySet().iterator();
hyperlinks = new HashMap();
medialinks = new HashMap();
emaillinks = new HashMap();
Map.Entry entry;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
if ((url != null) && (url.startsWith("mailto:"))) {
emaillinks.put(url.substring(7), entry.getValue());
} else {
extpos = url.lastIndexOf(".");
String normal;
if (extpos > 0) {
ext = url.substring(extpos).toLowerCase();
normal = urlNormalform(url);
if (normal != null) {
if (mediaExt.indexOf(ext.substring(1)) >= 0) {
// this is not an normal anchor, its a media link
medialinks.put(normal, entry.getValue());
} else {
hyperlinks.put(normal, entry.getValue());
}
}
}
}
}
// finally add the images to the medialinks
i = images.entrySet().iterator();
String normal;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
url = (String) entry.getKey();
normal = urlNormalform(url);
if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
}
expandHyperlinks();
}
public synchronized void expandHyperlinks() {
// we add artificial hyperlinks to the hyperlink set that can be calculated from
// given hyperlinks and imagelinks
hyperlinks.putAll(allReflinks(hyperlinks));
hyperlinks.putAll(allReflinks(medialinks));
hyperlinks.putAll(allSubpaths(hyperlinks));
hyperlinks.putAll(allSubpaths(medialinks));
}
}
private static Map allReflinks(Map links) {
static Map allReflinks(Map links) {
// we find all links that are part of a reference inside a url
HashMap v = new HashMap();
Iterator i = links.keySet().iterator();
@ -293,7 +278,7 @@ public class plasmaParser {
return v;
}
private static Map allSubpaths(Map links) {
static Map allSubpaths(Map links) {
HashMap v = new HashMap();
Iterator i = links.keySet().iterator();
String s;
@ -312,4 +297,93 @@ public class plasmaParser {
return v;
}
/**
 * Manual test driver: loads the parser configuration "yacy.parser", reads a
 * hard-coded pdf file and runs it through {@link #parseSource(URL, String, byte[])}.
 * Problems are printed as stack trace.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    plasmaParser theParser = null;
    FileInputStream theInput = null;
    try {
        theParser = new plasmaParser(new File("yacy.parser"));
        theInput = new FileInputStream(new File("Y:/public_html/test.pdf"));
        ByteArrayOutputStream theOutput = new ByteArrayOutputStream();
        serverFileUtils.copy(theInput, theOutput);

        theParser.parseSource(new URL("http://brain"),"application/pdf",theOutput.toByteArray());
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // release the file handle and the parser pool in every case
        if (theInput != null) {
            try {
                theInput.close();
            } catch (IOException ignored) {
                // nothing left to do for a test driver
            }
        }
        if (theParser != null) {
            theParser.close();
        }
    }
}
}
/**
 * Object factory of the parser pool. The pool key is the fully qualified
 * class name of the parser; a new parser is created by loading that class and
 * calling its no-argument constructor.
 */
final class plasmaParserFactory implements KeyedPoolableObjectFactory {

    public plasmaParserFactory() {
        super();
    }

    /**
     * Creates a new instance of the class named by the key.
     *
     * @param key the class name, must be a {@link String}
     * @return the new instance
     * @throws IllegalArgumentException if the key is not a string
     * @throws Exception if the class cannot be loaded or instantiated
     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#makeObject(java.lang.Object)
     */
    public Object makeObject(Object key) throws Exception {
        if (!(key instanceof String))
            throw new IllegalArgumentException("The object key must be of type string.");

        Class moduleClass = Class.forName((String)key);
        return moduleClass.newInstance();
    }

    /**
     * Nothing to do: the {@link Parser} interface offers no way to release
     * resources, a dropped parser is simply left to the garbage collector.
     *
     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#destroyObject(java.lang.Object, java.lang.Object)
     */
    public void destroyObject(Object key, Object obj) {
        // intentionally empty
    }

    /**
     * An object is valid for this pool if and only if it is a {@link Parser}.
     *
     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#validateObject(java.lang.Object, java.lang.Object)
     */
    public boolean validateObject(Object key, Object obj) {
        return obj instanceof Parser;
    }

    /**
     * Nothing to do when a parser is handed out by the pool.
     *
     * @param key the pool key of the object
     * @param obj the object that is about to be borrowed
     */
    public void activateObject(Object key, Object obj) {
        //log.debug(" activateObject...");
    }

    /**
     * Resets a parser when it is given back to the pool so that it can be
     * reused for the next document.
     *
     * @param key the pool key of the object
     * @param obj the object that was returned
     */
    public void passivateObject(Object key, Object obj) {
        //log.debug(" passivateObject..." + obj);
        if (obj instanceof Parser) {
            ((Parser) obj).reset();
        }
    }
}
/**
 * The keyed object pool that holds the parser instances. It adds nothing to
 * {@link GenericKeyedObjectPool}; borrowing and returning are passed straight
 * on to the superclass.
 */
final class plasmaParserPool extends GenericKeyedObjectPool {

    /**
     * @param parserFactory the factory that creates the pooled parsers
     * @param poolConfig    the pool configuration
     */
    public plasmaParserPool(plasmaParserFactory parserFactory,
                            GenericKeyedObjectPool.Config poolConfig) {
        super(parserFactory, poolConfig);
    }

    /**
     * @param key the key of the requested object
     * @return the object obtained from the superclass pool
     */
    public Object borrowObject(Object key) throws Exception {
        final Object pooled = super.borrowObject(key);
        return pooled;
    }

    /**
     * @param key      the key the object is returned under
     * @param borrowed the object to give back
     */
    public void returnObject(Object key, Object borrowed) throws Exception {
        super.returnObject(key, borrowed);
    }
}

@ -0,0 +1,192 @@
/*
* Check4Update is a stand-alone server application that can be used to
* monitor various types of online resources for updates and changes and
* notifies the user if a modification was detected.
*
* Copyright (C) 2005 Martin Thelian
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* For more information, please email thelian@users.sourceforge.net
*
*/
/* =======================================================================
* Revision Control Information
* $Source: $
* $Author: $
* $Date: $
* $Revision: $
* ======================================================================= */
package de.anomic.plasma;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
 * The result of parsing a resource: its text, its titles and the links it
 * contains.
 *
 * The anchors and images handed in are sorted lazily, on first access, into
 * three calculated views: hyperlinks, media links and email links.
 */
public class plasmaParserDocument {

    URL location;       // the source url
    String mimeType;    // mimeType as taken from http header
    String keywords;    // most resources provide a keyword field
    String shortTitle;  // a shortTitle mostly appears in the window header (border)
    String longTitle;   // the real title of the document, commonly h1-tags
    String[] sections;  // if present: more titles/headlines appearing in the document
    String abstrct;     // an abstract, if present: short content description
    byte[] text;        // the clear text, all that is visible
    Map anchors;        // all links embedded as clickable entities (anchor tags)
    Map images;         // all visible pictures in document
    // the anchors and images - Maps are URL-to-EntityDescription mappings.
    // The EntityDescription appear either as visible text in anchors or as alternative
    // text in image tags.
    Map hyperlinks;     // calculated view, null until the links were sorted
    Map medialinks;     // calculated view, null until the links were sorted
    Map emaillinks;     // calculated view, null until the links were sorted

    /**
     * @param location   the source url
     * @param mimeType   the mime type of the resource
     * @param keywords   the keywords of the resource, may be null
     * @param shortTitle the short title, may be null
     * @param longTitle  the long title, may be null
     * @param sections   further headlines, may be null
     * @param abstrct    a short content description, may be null
     * @param text       the visible text
     * @param anchors    url-to-description map of the anchors; null is
     *                   treated as "no anchors"
     * @param images     url-to-description map of the images; null is
     *                   treated as "no images"
     */
    public plasmaParserDocument(URL location, String mimeType,
                    String keywords, String shortTitle, String longTitle,
                    String[] sections, String abstrct,
                    byte[] text, Map anchors, Map images) {
        this.location = location;
        this.mimeType = mimeType;
        this.keywords = keywords;
        this.shortTitle = shortTitle;
        this.longTitle = longTitle;
        this.sections = sections;
        this.abstrct = abstrct;
        this.text = text;
        this.anchors = (anchors==null)?new HashMap():anchors;
        this.images = (images==null)?new HashMap():images;
        this.hyperlinks = null;
        this.medialinks = null;
        this.emaillinks = null;
    }

    /**
     * Resolves a path relative to the document location.
     *
     * @return the normalized absolute url, or "" if it cannot be built
     */
    private String absolutePath(String relativePath) {
        try {
            return plasmaParser.urlNormalform(new URL(location, relativePath));
        } catch (Exception e) {
            return "";
        }
    }

    /** @return the short title, or the long title if there is no short one */
    public String getMainShortTitle() {
        if (shortTitle != null) return shortTitle; else return longTitle;
    }

    /** @return the long title, or the short title if there is no long one */
    public String getMainLongTitle() {
        if (longTitle != null) return longTitle; else return shortTitle;
    }

    /** @return the section titles, or only the main long title if there are none */
    public String[] getSectionTitles() {
        if (sections != null) return sections; else return new String[]{getMainLongTitle()};
    }

    /** @return the abstract, or the main long title if there is no abstract */
    public String getAbstract() {
        if (abstrct != null) return abstrct; else return getMainLongTitle();
    }

    public byte[] getText() {
        // returns only the clear (visible) text (not the source data)
        return text;
    }

    public Map getAnchors() {
        // returns all links embedded as anchors (clickable entities)
        return anchors;
    }

    public Map getImages() {
        // returns all links embedded as pictures (visible in document)
        return images;
    }

    // the next three methods provide a calculated view on the getAnchors/getImages:

    public Map getHyperlinks() {
        // this is a subset of the getAnchor-set: only links to other hyperrefs
        if (hyperlinks == null) resortLinks();
        return hyperlinks;
    }

    public Map getMedialinks() {
        // this is partly subset of getAnchor and getImage: all non-hyperrefs
        if (medialinks == null) resortLinks();
        return medialinks;
    }

    public Map getEmaillinks() {
        // this is part of the getAnchor-set: only links to email addresses
        if (emaillinks == null) resortLinks();
        return emaillinks;
    }

    /**
     * Sorts anchors and images into the hyperlink, media link and email link
     * views and finally expands the hyperlinks.
     */
    private synchronized void resortLinks() {
        hyperlinks = new HashMap();
        medialinks = new HashMap();
        emaillinks = new HashMap();

        Iterator i = anchors.entrySet().iterator();
        Map.Entry entry;
        String url;
        String normal;
        while (i.hasNext()) {
            entry = (Map.Entry) i.next();
            url = (String) entry.getKey();
            if (url == null) continue; // nothing to classify

            if (url.startsWith("mailto:")) {
                emaillinks.put(url.substring(7), entry.getValue());
                continue;
            }

            // as before, anchors without any '.' are not put into a view
            int extpos = url.lastIndexOf(".");
            if (extpos <= 0) continue;

            normal = plasmaParser.urlNormalform(url);
            if (normal == null) continue;

            if (isMediaExtension(url.substring(extpos + 1).toLowerCase())) {
                // this is not an normal anchor, its a media link
                medialinks.put(normal, entry.getValue());
            } else {
                hyperlinks.put(normal, entry.getValue());
            }
        }

        // finally add the images to the medialinks
        i = images.entrySet().iterator();
        while (i.hasNext()) {
            entry = (Map.Entry) i.next();
            url = (String) entry.getKey();
            normal = plasmaParser.urlNormalform(url);
            if (normal != null) medialinks.put(normal, entry.getValue()); // avoid NullPointerException
        }

        expandHyperlinks();
    }

    /**
     * Tests whether an extension is one of the entries of the comma separated
     * list plasmaParser.mediaExt. The whole entry has to match; a plain
     * substring search would also accept fragments such as "ar" (part of
     * "rar") or the empty string.
     */
    private static boolean isMediaExtension(String ext) {
        if ((ext.length() == 0) || (ext.indexOf(',') >= 0)) return false;
        return ("," + plasmaParser.mediaExt + ",").indexOf("," + ext + ",") >= 0;
    }

    /**
     * Adds the artificial hyperlinks that plasmaParser.allReflinks and
     * plasmaParser.allSubpaths calculate from the given hyperlinks and
     * media links.
     */
    public synchronized void expandHyperlinks() {
        if (hyperlinks == null) {
            // the views do not exist yet; building them includes the expansion
            resortLinks();
            return;
        }
        // we add artificial hyperlinks to the hyperlink set that can be calculated from
        // given hyperlinks and imagelinks
        hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks));
        hyperlinks.putAll(plasmaParser.allReflinks(medialinks));
        hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks));
        hyperlinks.putAll(plasmaParser.allSubpaths(medialinks));
    }

}

@ -149,7 +149,9 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
public kelondroTables facilityDB;
public plasmaParser parser;
public int serverJobs;
public boolean terminate = false;
private serverSemaphore shutdownSync = new serverSemaphore(0);
private boolean terminate = false;
public plasmaSwitchboard(String rootPath, String initPath, String configPath) throws IOException {
super(rootPath, initPath, configPath);
@ -207,7 +209,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
initProfiles();
// make parser
parser = new plasmaParser(new File(""));
parser = new plasmaParser(new File("yacy.parser"));
// start indexing management
loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL);
@ -502,7 +504,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
log.logDebug(stats + " processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", source=" + ((entry.cacheArray == null) ? "scraper" : "byte[]") + ", url=" + entry.nomalizedURLString); // DEBUG
// parse content
plasmaParser.document document;
plasmaParserDocument document;
if (entry.scraper != null) {
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is pre-parsed by scraper");
document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
@ -1397,4 +1399,18 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
if (adminAccountBase64MD5.equals(serverCodings.standardCoder.encodeMD5Hex(authorization))) return 4; // hard-authenticated, all ok
return 0; // wrong password
}
// Requests the shutdown: sets the terminate flag first and then performs a V()
// on the shutdown semaphore, the counterpart of the P() in waitForShutdown().
public void terminate() {
    terminate = true;
    shutdownSync.V();
}
// Returns the current value of the terminate flag (true once terminate() was called).
public boolean isTerminated() {
    final boolean terminated = terminate;
    return terminated;
}
// Performs a P() on the shutdown semaphore and afterwards returns the value of
// the terminate flag.
// Throws InterruptedException if the P() call is interrupted.
public boolean waitForShutdown() throws InterruptedException {
    shutdownSync.P();
    return terminate;
}
}

@ -0,0 +1,104 @@
//serverSemaphore.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//this file is contributed by Martin Thelian
//last major change: 24.04.2005
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this softare or this documentation. The usage of this software
//is on your own risk. The installation and usage (starting/running) of this
//software may allow other people or application to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be
//done inside the copyright notive above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.server;
/**
 * A counting semaphore built on the monitor of the semaphore object itself.
 *
 * The semaphore holds a number of permits between 0 and a maximum value.
 * {@link #P()} takes one permit and blocks while none is available,
 * {@link #V()} returns one permit and wakes up a waiting thread.
 * All state is accessed only inside synchronized methods.
 */
public final class serverSemaphore {

    /** Number of permits currently available; never negative. */
    private long currentValue = 0;

    /** Largest value {@link #currentValue} may take. */
    private long maximumValue = Long.MAX_VALUE;

    /** Creates a semaphore without permits and with the largest possible maximum. */
    protected serverSemaphore() {
        this(0, Long.MAX_VALUE);
    }

    /**
     * Creates a semaphore with the largest possible maximum.
     *
     * @param initialValue number of permits available at the beginning, must be &gt;= 0
     */
    public serverSemaphore(long initialValue) {
        this(initialValue, Long.MAX_VALUE);
    }

    /**
     * Creates a semaphore.
     *
     * @param initialValue number of permits available at the beginning, must be &gt;= 0
     * @param maxValue     upper bound for the number of permits, must be &gt;= 1 and
     *                     not less than <code>initialValue</code>
     * @throws IllegalArgumentException if one of the bounds is violated
     */
    protected serverSemaphore(long initialValue, long maxValue) {
        /* some errorhandling */
        if (maxValue < initialValue) {
            throw new IllegalArgumentException("The semaphore maximum value must not be " +
                                               "less than the semaphore init value.");
        }
        if (maxValue < 1) {
            throw new IllegalArgumentException("The semaphore maximum value must be greater or equal 1.");
        }
        if (initialValue < 0) {
            throw new IllegalArgumentException("The semaphore initial value must be greater or equal 0.");
        }

        // setting the initial semaphore values
        this.currentValue = initialValue;
        this.maximumValue = maxValue;
    }

    /**
     * Takes one permit, blocking until one is available.
     *
     * @throws InterruptedException if the thread is interrupted while waiting;
     *                              no permit is taken in that case
     */
    public synchronized void P() throws InterruptedException {
        try {
            // wait() may return without a matching V() (spurious wakeup), and another
            // thread may have taken the permit first, so the condition is re-checked
            while (this.currentValue <= 0) {
                wait();
            }
        } catch (InterruptedException e) {
            // this thread may have been the target of a notify(); hand it on so
            // that a released permit is not left unnoticed by other waiters
            notify();
            throw e;
        }
        this.currentValue--;
    }

    /**
     * Returns one permit and wakes up one waiting thread, if there is any.
     *
     * @throws IndexOutOfBoundsException if the semaphore already holds its maximum value
     */
    public synchronized void V() {
        // the maximum itself is a legal value (the constructor accepts
        // initialValue == maxValue); only going beyond it is an error
        if (this.currentValue >= this.maximumValue) {
            throw new IndexOutOfBoundsException("The maximum value of the semaphore was reached");
        }
        this.currentValue++;
        notify();
    }
}

@ -79,7 +79,7 @@ public final class yacy {
// static objects
private static final String vString = "@REPL_VERSION@";
private static final String vDATE = "@REPL_DATE@";
private static final String vDATE = "20050422";
private static final String copyright = "[ YACY Proxy v" + vString + ", build " + vDATE + " by Michael Christen / www.yacy.net ]";
private static final String hline = "-------------------------------------------------------------------------------";
@ -237,10 +237,9 @@ public final class yacy {
serverSystem.openBrowser("http://localhost:" + port + "/" + browserPopUpPage, browserPopUpApplication);
}
// loop and wait
while (!(sb.terminate)) try {
Thread.currentThread().sleep(1000); // wait a while
// System.gc(); // prevent that we catch too much memory
// wait for server shutdown
try {
sb.waitForShutdown();
} catch (Exception e) {
serverLog.logError("MAIN CONTROL LOOP", "PANIK: " + e.getMessage());
e.printStackTrace();
@ -259,7 +258,7 @@ public final class yacy {
// idle until the processes are down
while (server.isAlive()) {
Thread.currentThread().sleep(2000); // wait a while
Thread.currentThread().sleep(2000); // wait a while
}
serverLog.logSystem("SHUTDOWN", "server has terminated");
sb.close();

@ -0,0 +1 @@
application/pdf=de.anomic.plasma.parser.pdf.pdfParser
Loading…
Cancel
Save