diff --git a/build.xml b/build.xml index 97723f3bc..3e7b9e535 100644 --- a/build.xml +++ b/build.xml @@ -3,6 +3,7 @@ YaCy - a Peer to Peer search Engine + @@ -10,6 +11,10 @@ + + + + @@ -18,6 +23,7 @@ + @@ -36,7 +42,7 @@ - + @@ -45,9 +51,9 @@ - - - + + + @@ -56,41 +62,19 @@ - - - - - - - - - - - - - - - - - - + + + + + + + + + + - - - - - - - - - - - - - - - + diff --git a/htroot/SettingsAck_p.html b/htroot/SettingsAck_p.html index 1feb7f204..1d7ed9c14 100644 --- a/htroot/SettingsAck_p.html +++ b/htroot/SettingsAck_p.html @@ -63,7 +63,9 @@ The settings have not been changed. :: The submitted peer name is not well-formed. Please choose a different name.
Peer names must not contain characters other than (a-z, A-Z, 0-9, '-', '_') and must not be longer than 80 characters. -The settings have not been changed.#(/info)# +The settings have not been changed. +:: +The new parser settings were changed successfully.#(/info)#

You can now go back to the Settings page if you want to make more changes.

diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index b8792e931..637cfd901 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -46,6 +46,7 @@ import java.util.*; import java.io.*; import de.anomic.tools.*; +import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.*; import de.anomic.yacy.*; import de.anomic.http.*; @@ -251,6 +252,19 @@ public class SettingsAck_p { return prop; } + if (post.containsKey("parserSettings")) { + plasmaSwitchboard sb = (plasmaSwitchboard)env; + post.remove("parserSettings"); + + // activate all received parsers + Enumeration mimeTypeEnum = post.keys(); + sb.parser.setEnabledParserList(mimeTypeEnum); + + prop.put("info", 18); + return prop; + } + + // nothing made prop.put("info", 1);//no information submitted return prop; diff --git a/htroot/Settings_p.html b/htroot/Settings_p.html index bdeef009e..cfde67205 100644 --- a/htroot/Settings_p.html +++ b/htroot/Settings_p.html @@ -171,6 +171,24 @@ but only if there had been changes to the seed-list.
+

+
Content Parser Settings +

Activation/Deactivation of additional content parsers ...

+

+ +#{parser}# + + + + + +#{/parser}# + + + +
#[mime]##[name]#
+

+

#[footer]# diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java index f15e4ac5b..6b3b957e1 100644 --- a/htroot/Settings_p.java +++ b/htroot/Settings_p.java @@ -45,6 +45,7 @@ import java.util.*; import de.anomic.tools.*; +import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.*; import de.anomic.yacy.*; import de.anomic.http.*; @@ -124,8 +125,37 @@ public class Settings_p { prop.put("seedFTPPath", env.getConfig("seedFTPPath", "")); prop.put("seedFTPAccount", env.getConfig("seedFTPAccount", "")); prop.put("seedFTPPassword", env.getConfig("seedFTPPassword", "")); - prop.put("seedURL", env.getConfig("seedURL", "")); + prop.put("seedURL", env.getConfig("seedURL", "")); + + /* + * Parser Configuration + */ + plasmaSwitchboard sb = (plasmaSwitchboard)env; + Hashtable enabledParsers = sb.parser.getEnabledParserList(); + Hashtable availableParsers = sb.parser.getAvailableParserList(); + + // fetching a list of all available mimetypes + List availableParserKeys = Arrays.asList(availableParsers.keySet().toArray(new String[availableParsers.size()])); + + // sort it + Collections.sort(availableParserKeys); + + // loop through the mimeTypes and add it to the properties + int parserIdx = 0; + Iterator availableParserIter = availableParserKeys.iterator(); + while (availableParserIter.hasNext()) { + String mimeType = (String) availableParserIter.next(); + + prop.put("parser_" + parserIdx + "_mime", mimeType); + prop.put("parser_" + parserIdx + "_name", availableParsers.get(mimeType)); + prop.put("parser_" + parserIdx + "_status", enabledParsers.containsKey(mimeType) ? 
1:0); + + parserIdx++; + } + + prop.put("parser", parserIdx); + // return rewrite properties return prop; } diff --git a/libx/commons-logging.jar b/libx/commons-logging.jar new file mode 100644 index 000000000..b99c9375a Binary files /dev/null and b/libx/commons-logging.jar differ diff --git a/libx/informa-0.6.0.jar b/libx/informa-0.6.0.jar new file mode 100644 index 000000000..0dd0b807c Binary files /dev/null and b/libx/informa-0.6.0.jar differ diff --git a/libx/informa-0.6.0.license b/libx/informa-0.6.0.license new file mode 100644 index 000000000..ea03906dd --- /dev/null +++ b/libx/informa-0.6.0.license @@ -0,0 +1,24 @@ +// +// Informa -- RSS Library for Java +// Copyright (c) 2002 by Niko Schmuck +// +// Niko Schmuck +// http://sourceforge.net/projects/informa +// mailto:niko_schmuck@users.sourceforge.net +// +// This library is free software. +// +// You may redistribute it and/or modify it under the terms of the GNU +// Lesser General Public License as published by the Free Software Foundation. +// +// Version 2.1 of the license should be included with this distribution in +// the file LICENSE. If the license is not included with this distribution, +// you may find a copy at the FSF web site at 'www.gnu.org' or 'www.fsf.org', +// or you may write to the Free Software Foundation, 675 Mass Ave, Cambridge, +// MA 02139 USA. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied waranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// diff --git a/libx/jdom.jar b/libx/jdom.jar new file mode 100644 index 000000000..0bebc0272 Binary files /dev/null and b/libx/jdom.jar differ diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index 92f00476d..3661c2302 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -398,7 +398,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { return bb; } - protected static serverByteBuffer stripAll(serverByteBuffer bb) { + public static serverByteBuffer stripAll(serverByteBuffer bb) { //return stripAllTags(s); return convertUmlaute(transscriptAll(stripAllTags(bb))); } diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java new file mode 100644 index 000000000..fe702d1eb --- /dev/null +++ b/source/de/anomic/plasma/parser/AbstractParser.java @@ -0,0 +1,125 @@ +//AbstractParser.java +//------------------------ +//part of YaCy +//(C) by Michael Peter Christen; mc@anomic.de +//first published on http://www.anomic.de +//Frankfurt, Germany, 2005 +// +//this file was contributed by Martin Thelian +//last major change: $LastChangedDate$ by $LastChangedBy$ +//Revision: $LastChangedRevision$ +// +//This program is free software; you can redistribute it and/or modify +//it under the terms of the GNU General Public License as published by +//the Free Software Foundation; either version 2 of the License, or +//(at your option) any later version. +// +//This program is distributed in the hope that it will be useful, +//but WITHOUT ANY WARRANTY; without even the implied warranty of +//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +//GNU General Public License for more details. 
+// +//You should have received a copy of the GNU General Public License +//along with this program; if not, write to the Free Software +//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//Using this software in any meaning (reading, learning, copying, compiling, +//running) means that you agree that the Author(s) is (are) not responsible +//for cost, loss of data or any harm that may be caused directly or indirectly +//by usage of this softare or this documentation. The usage of this software +//is on your own risk. The installation and usage (starting/running) of this +//software may allow other people or application to access your computer and +//any attached devices and is highly dependent on the configuration of the +//software which must be done by the user of the software; the author(s) is +//(are) also not responsible for proper configuration and usage of the +//software, even if provoked by documentation provided together with +//the software. +// +//Any changes to this file according to the GPL as documented in the file +//gpl.txt aside this file in the shipment you received can be done to the +//lines that follows this copyright notice here, but changes must not be +//done inside the copyright notive above. A re-distribution must contain +//the intact and unchanged copyright notice. +//Contributions and changes to the program code must be marked as such. + +package de.anomic.plasma.parser; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.InputStream; +import java.net.URL; + +import de.anomic.plasma.plasmaParserDocument; + +/** + * New classes implementing the {@link de.anomic.plasma.parser.Parser} interface + * can extend this class to inherit all functions already implemented in this class. 
+ * @author Martin Thelian + * @version $LastChangedRevision$ / $LastChangedDate$ + */ +public abstract class AbstractParser implements Parser{ + + /** + * The Constructor of this class. + */ + public AbstractParser() { + super(); + } + + /** + * Parsing a document available as byte array. + * @param location the origin of the document + * @param mimeType the mimetype of the document + * @param source the content byte array + * @return a {@link plasmaParserDocument} containing the extracted plain text of the document + * and some additional metadata. + * @throws ParserException if the content could not be parsed properly + * + * @see de.anomic.plasma.parser.Parser#parse(java.net.URL, java.lang.String, byte[]) + */ + public plasmaParserDocument parse(URL location, String mimeType, + byte[] source) throws ParserException { + ByteArrayInputStream contentInputStream = new ByteArrayInputStream(source); + return this.parse(location,mimeType,contentInputStream); + } + + /** + * Parsing a document stored in a {@link File} + * @param location the origin of the document + * @param mimeType the mimetype of the document + * @param sourceFile the file containing the content of the document + * @return a {@link plasmaParserDocument} containing the extracted plain text of the document + * and some additional metadata. 
+ * @throws ParserException if the content could not be parsed properly + * + * @see de.anomic.plasma.parser.Parser#parse(java.net.URL, java.lang.String, java.io.File) + */ + public plasmaParserDocument parse(URL location, String mimeType, + File sourceFile) throws ParserException { + BufferedInputStream contentInputStream = null; + try { + contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + return this.parse(location, mimeType, contentInputStream); + } + + /** + * Parsing a document available as {@link InputStream} + * @param location the origin of the document + * @param mimeType the mimetype of the document + * @param source the {@link InputStream} containing the document content + * @return a {@link plasmaParserDocument} containing the extracted plain text of the document + * and some additional metadata. + * @throws ParserException if the content could not be parsed properly + * + * @see de.anomic.plasma.parser.Parser#parse(java.net.URL, java.lang.String, java.io.InputStream) + */ + public abstract plasmaParserDocument parse(URL location, String mimeType, + InputStream source) throws ParserException; + +} diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java index 5ae7b7cc9..a714735f1 100644 --- a/source/de/anomic/plasma/parser/Parser.java +++ b/source/de/anomic/plasma/parser/Parser.java @@ -5,8 +5,9 @@ //first published on http://www.anomic.de //Frankfurt, Germany, 2005 // -//this file is contributed by Martin Thelian -//last major change: 24.04.2005 +//this file was contributed by Martin Thelian +//last major change: $LastChangedDate$ by $LastChangedBy$ +//Revision: $LastChangedRevision$ // //This program is free software; you can redistribute it and/or modify //it under the terms of the GNU General Public License as published by @@ -46,23 +47,67 @@ package de.anomic.plasma.parser; import java.io.File; import 
java.io.InputStream; import java.net.URL; -import java.util.HashSet; +import java.util.Hashtable; import de.anomic.plasma.plasmaParserDocument; +/** + * This interface defines a list of methods that needs to be implemented + * by each content parser class. + * @author Martin Thelian + * @version $LastChangedRevision$ / $LastChangedDate$ + */ public interface Parser { + /** + * Parsing a document available as byte array + * @param location the origin of the document + * @param mimeType the mimetype of the document + * @param source the content byte array + * @return a {@link plasmaParserDocument} containing the extracted plain text of the document + * and some additional metadata. + * + * @throws ParserException if the content could not be parsed properly + */ public plasmaParserDocument parse(URL location, String mimeType, byte[] source) throws ParserException; + /** + * Parsing a document stored in a {@link File} + * @param location the origin of the document + * @param mimeType the mimetype of the document + * @param sourceFile the file containing the content of the document + * @return a {@link plasmaParserDocument} containing the extracted plain text of the document + * and some additional metadata. + * + * @throws ParserException if the content could not be parsed properly + */ public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException; + /** + * Parsing a document available as {@link InputStream} + * @param location the origin of the document + * @param mimeType the mimetype of the document + * @param source the {@link InputStream} containing the document content + * @return a {@link plasmaParserDocument} containing the extracted plain text of the document + * and some additional metadata. 
+ * + * @throws ParserException if the content could not be parsed properly + */ public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException; - public HashSet getSupportedMimeTypes(); + /** + * Can be used to determine the MimeType(s) that are supported by the parser + * @return a {@link Hashtable} containing a list of MimeTypes that are supported by + * the parser + */ + public Hashtable getSupportedMimeTypes(); + /** + * This function should be called before reusing the parser object. + */ public void reset(); diff --git a/source/de/anomic/plasma/parser/doc/build.xml b/source/de/anomic/plasma/parser/doc/build.xml new file mode 100644 index 000000000..863a9585e --- /dev/null +++ b/source/de/anomic/plasma/parser/doc/build.xml @@ -0,0 +1,40 @@ + + + + A class to parse doc documents (application/msword) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index 6e76a40ce..ae9f80efc 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -43,53 +43,33 @@ package de.anomic.plasma.parser.doc; -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.InputStream; import java.net.URL; -import java.util.Arrays; -import java.util.HashSet; +import java.util.Hashtable; + import org.textmining.text.extraction.WordExtractor; import de.anomic.plasma.plasmaParserDocument; +import de.anomic.plasma.parser.AbstractParser; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; -public class docParser implements Parser { +public class docParser +extends AbstractParser +implements Parser { /** * a list of mime types that are supported by this parser class - */ - public static final 
HashSet SUPPORTED_MIME_TYPES = new HashSet(Arrays.asList(new String[] { - new String("application/msword") - })); - + * @see #getSupportedMimeTypes() + */ + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + static { SUPPORTED_MIME_TYPES.put("application/msword","doc"); } public docParser() { super(); } - public plasmaParserDocument parse(URL location, String mimeType, - byte[] source) throws ParserException { - ByteArrayInputStream contentInputStream = new ByteArrayInputStream(source); - return this.parse(location,mimeType,contentInputStream); - } - - public plasmaParserDocument parse(URL location, String mimeType, - File sourceFile) throws ParserException { - BufferedInputStream contentInputStream = null; - try { - contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - return this.parse(location, mimeType, contentInputStream); - } - public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException { @@ -117,21 +97,12 @@ public class docParser implements Parser { } } - public HashSet getSupportedMimeTypes() { + public java.util.Hashtable getSupportedMimeTypes() { return docParser.SUPPORTED_MIME_TYPES; } public void reset() { - // TODO Auto-generated method stub - - } - - /** - * @param args - */ - public static void main(String[] args) { - // TODO Auto-generated method stub - + // Nothing todo here at the moment } } diff --git a/source/de/anomic/plasma/parser/pdf/build.xml b/source/de/anomic/plasma/parser/pdf/build.xml new file mode 100644 index 000000000..188f46a18 --- /dev/null +++ b/source/de/anomic/plasma/parser/pdf/build.xml @@ -0,0 +1,44 @@ + + + + A class to parse pdf documents (application/pdf) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 
01a25440a..821f200bb 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -54,6 +54,7 @@ import java.io.OutputStreamWriter; import java.net.URL; import java.util.Arrays; import java.util.HashSet; +import java.util.Hashtable; import org.pdfbox.pdfparser.PDFParser; @@ -62,42 +63,36 @@ import org.pdfbox.pdmodel.PDDocumentInformation; import org.pdfbox.util.PDFTextStripper; import de.anomic.plasma.plasmaParserDocument; +import de.anomic.plasma.parser.AbstractParser; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; -public class pdfParser implements Parser +public class pdfParser extends AbstractParser implements Parser { /** * a list of mime types that are supported by this parser class + * @see #getSupportedMimeTypes() */ - public static final HashSet SUPPORTED_MIME_TYPES = new HashSet(Arrays.asList(new String[] { - new String("application/pdf") - })); + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + static { SUPPORTED_MIME_TYPES.put("application/pdf","pdf"); } + + /** + * a list of file extensions that are supported by this parser class + * @see #getSupportedMimeTypes() + */ + public static final HashSet SUPPORTED_FILE_EXT = new HashSet(Arrays.asList(new String[] { + new String("pdf") + })); public pdfParser() { super(); } - public HashSet getSupportedMimeTypes() { + public Hashtable getSupportedMimeTypes() { return SUPPORTED_MIME_TYPES; } - public plasmaParserDocument parse(URL location, String mimeType, File sourceFile) throws ParserException { - BufferedInputStream contentInputStream = null; - try { - contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - return this.parse(location, mimeType, contentInputStream); - } - - public plasmaParserDocument parse(URL location, String mimeType, byte[] source) throws ParserException { - 
ByteArrayInputStream contentInputStream = new ByteArrayInputStream(source); - return this.parse(location,mimeType,contentInputStream); - } - public plasmaParserDocument parse(URL location, String mimeType, InputStream source) throws ParserException { try { @@ -155,8 +150,12 @@ public class pdfParser implements Parser } public void reset() { - // TODO Auto-generated method stub + // Nothing todo here at the moment } + public HashSet getSupportedFileExtensions() { + return SUPPORTED_FILE_EXT; + } + } diff --git a/source/de/anomic/plasma/parser/rss/build.xml b/source/de/anomic/plasma/parser/rss/build.xml new file mode 100644 index 000000000..7426e1de7 --- /dev/null +++ b/source/de/anomic/plasma/parser/rss/build.xml @@ -0,0 +1,47 @@ + + + + A class to parse rss/atom feeds + (application/rss+xml, application/rdf+xml, application/atom+xml, application/rss) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java new file mode 100644 index 000000000..44b4b7c1c --- /dev/null +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -0,0 +1,180 @@ +package de.anomic.plasma.parser.rss; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URL; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.Map; + +import de.anomic.htmlFilter.htmlFilterAbstractScraper; +import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.htmlFilter.htmlFilterOutputStream; +import de.anomic.plasma.plasmaParserDocument; +import de.anomic.plasma.parser.AbstractParser; +import de.anomic.plasma.parser.Parser; +import de.anomic.plasma.parser.ParserException; +import de.anomic.server.serverByteBuffer; +import 
de.anomic.server.serverFileUtils; +import de.nava.informa.core.ChannelIF; +import de.nava.informa.core.ImageIF; +import de.nava.informa.impl.basic.ChannelBuilder; +import de.nava.informa.impl.basic.Item; +import de.nava.informa.parsers.FeedParser; + +public class rssParser extends AbstractParser implements Parser { + + /** + * a list of mime types that are supported by this parser class + * @see #getSupportedMimeTypes() + */ + public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable(); + static { + SUPPORTED_MIME_TYPES.put("text/rss","xml,rss,rdf"); + SUPPORTED_MIME_TYPES.put("application/rdf+xml","xml,rss,rdf"); + SUPPORTED_MIME_TYPES.put("application/rss+xml","xml,rss,rdf"); + SUPPORTED_MIME_TYPES.put("application/atom+xml","xml,atom"); + } + + + /** + * a list of file extensions that are supported by this parser class + * @see #getSupportedMimeTypes() + */ + public static final HashSet SUPPORTED_FILE_EXT = new HashSet(Arrays.asList(new String[] { + new String("xml"), + new String("rss"), + new String("rdf"), + new String("atom") + })); + + public rssParser() { + super(); + } + + public plasmaParserDocument parse(URL location, String mimeType, + InputStream source) throws ParserException { + + try { + LinkedList feedSections = new LinkedList(); + HashMap anchors = new HashMap(); + HashMap images = new HashMap(); + serverByteBuffer text = new serverByteBuffer(); + + + // creating a channel-builder + ChannelBuilder builder = new ChannelBuilder(); + + // parsing the rss/atom feed + ChannelIF channel = FeedParser.parse(builder, source); + + // getting the rss feed title and description + String feedTitle = channel.getTitle(); + + // getting the feed description + String feedDescription = channel.getDescription(); + + // getting the channel site url + URL channelSiteURL = channel.getSite(); + + ImageIF channelImage = channel.getImage(); + if (channelImage != null) { + images.put(channelImage.getLocation().toString(),channelImage.getTitle()); + } + + // loop 
through the feed items + Collection feedItemCollection = channel.getItems(); + if (!feedItemCollection.isEmpty()) { + Iterator feedItemIterator = feedItemCollection.iterator(); + while (feedItemIterator.hasNext()) { + Item item = (Item)feedItemIterator.next(); + + String itemTitle = item.getTitle(); + URL itemURL = item.getLink(); + String itemDescr = item.getDescription(); + + feedSections.add(itemTitle); + anchors.put(itemURL.toString(),itemTitle); + + if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32); + text.append(new serverByteBuffer(htmlFilterAbstractScraper.stripAll(new serverByteBuffer(itemDescr.getBytes()))).trim()).append((byte) ' '); + + String itemContent = item.getElementValue("content"); + if ((itemContent != null) && (itemContent.length() > 0)) { + + htmlFilterContentScraper scraper = new htmlFilterContentScraper(itemURL); + OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); + serverFileUtils.copy(new ByteArrayInputStream(itemContent.getBytes()), os); + + String itemHeadline = scraper.getHeadline(); + if ((itemHeadline != null) && (itemHeadline.length() > 0)) { + feedSections.add(itemHeadline); + } + + Map itemLinks = scraper.getAnchors(); + if ((itemLinks != null) && (itemLinks.size() > 0)) { + anchors.putAll(itemLinks); + } + + Map itemImages = scraper.getImages(); + if ((itemImages != null) && (itemImages.size() > 0)) { + images.putAll(itemImages); + } + + byte[] extractedText = scraper.getText(); + if ((extractedText != null) && (extractedText.length > 0)) { + if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32); + text.append(scraper.getText()); + } + + } + } + } + + /* (URL location, String mimeType, + String keywords, String shortTitle, String longTitle, + String[] sections, String abstrct, + byte[] text, Map anchors, Map images) + */ + plasmaParserDocument theDoc = new plasmaParserDocument( + location, + mimeType, + null, + null, + feedTitle, 
+ (String[]) feedSections.toArray(new String[feedSections.size()]), + feedDescription, + text.getBytes(), + anchors, + images); + + return theDoc; + + } catch (Exception e) { + + } + + return null; + } + + public Hashtable getSupportedMimeTypes() { + return SUPPORTED_MIME_TYPES; + } + + public void reset() { + // TODO Auto-generated method stub + + } + + public HashSet getSupportedFileExtensions() { + // TODO Auto-generated method stub + return SUPPORTED_FILE_EXT; + } + +} diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index ba197bf87..27d5ab5bb 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -1,406 +1,609 @@ -// plasmaParser.java -// ------------------------ -// part of YaCy -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2005 -// last major change: 12.04.2005 -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. 
The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - -// compile: javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java - - -package de.anomic.plasma; - -import java.io.*; -import java.net.*; -import java.util.*; - -import org.apache.commons.pool.KeyedPoolableObjectFactory; -import org.apache.commons.pool.impl.GenericKeyedObjectPool; -import org.apache.commons.pool.impl.GenericObjectPool; - -import de.anomic.plasma.parser.Parser; -import de.anomic.server.serverFileUtils; -import de.anomic.htmlFilter.*; - -public final class plasmaParser { - - private final Properties parserList; - private final plasmaParserPool theParserPool; - - public static HashSet mediaExtSet = new HashSet(); - public static void initMediaExt(String mediaExtString) { - String[] xs = mediaExtString.split(","); - for (int i = 0; i < xs.length; i++) mediaExtSet.add(xs[i]); - } - static { - initMediaExt("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," + - "sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj"); - } - - public plasmaParser(File 
parserDispatcherPropertyFile) { - - /* =================================================== - * loading a list of availabe parser from file - * =================================================== */ - Properties prop = new Properties(); - try { - prop.load(new FileInputStream(parserDispatcherPropertyFile)); - } catch (IOException e) { - System.err.println("ERROR: " + parserDispatcherPropertyFile.toString() + " not found in settings path"); - } - this.parserList = prop; - - /* =================================================== - * initializing the parser object pool - * =================================================== */ - GenericKeyedObjectPool.Config config = new GenericKeyedObjectPool.Config(); - - // The maximum number of active connections that can be allocated from pool at the same time, - // 0 for no limit - config.maxActive = 0; - - // The maximum number of idle connections connections in the pool - // 0 = no limit. - config.maxIdle = 10; - - config.whenExhaustedAction = GenericObjectPool.WHEN_EXHAUSTED_BLOCK; - config.minEvictableIdleTimeMillis = 30000; - - this.theParserPool = new plasmaParserPool(new plasmaParserFactory(),config); - - /* =================================================== - * testing if all parsers could be loaded properly. - * This is done now to avoid surprises at runtime. - * =================================================== */ - if (this.parserList.size() > 0) { - Iterator parserIterator = this.parserList.values().iterator(); - while (parserIterator.hasNext()) { - String className = (String) parserIterator.next(); - try { - Class.forName(className); - } catch (Exception e) { - // if we could not load the parser we remove it from the parser list ... 
- this.parserList.remove(className); - } - } - } - } - - public void close() { - // release resources - try { - // clearing the parser list - this.parserList.clear(); - - // closing the parser object pool - this.theParserPool.close(); - } catch (Exception e) { - // - } - } - - public plasmaParserDocument parseSource(URL location, String mimeType, byte[] source) { - - Parser theParser = null; - try { - - if ((mimeType != null) && (mimeType.indexOf(";") != -1)) { - mimeType = mimeType.substring(0,mimeType.indexOf(";")); - } - - // getting the correct parser for the given mimeType - theParser = this.getParser(mimeType); - - // if a parser was found we use it ... - if (theParser != null) { - return theParser.parse(location, mimeType,source); - } - - // ...otherwise we make a html scraper and transformer - htmlFilterContentScraper scraper = new htmlFilterContentScraper(location); - OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); - - hfos.write(source); - return transformScraper(location, mimeType, scraper); - } catch (Exception e) { - return null; - } finally { - if (theParser != null) { - try { - this.theParserPool.returnObject(mimeType, theParser); - } catch (Exception e) { - } - } - } - } - - public plasmaParserDocument parseSource(URL location, String mimeType, File sourceFile) { - - Parser theParser = null; - try { - if ((mimeType != null) && (mimeType.indexOf(";") != -1)) { - mimeType = mimeType.substring(0,mimeType.indexOf(";")); - } - - // getting the correct parser for the given mimeType - theParser = this.getParser(mimeType); - - // if a parser was found we use it ... 
- if (theParser != null) { - return theParser.parse(location, mimeType,sourceFile); - } - - // ...otherwise we make a scraper and transformer - htmlFilterContentScraper scraper = new htmlFilterContentScraper(location); - OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); - - serverFileUtils.copy(sourceFile, hfos); - return transformScraper(location, mimeType, scraper); - } catch (Exception e) { - return null; - } finally { - if (theParser != null) { - try { - this.theParserPool.returnObject(mimeType, theParser); - } catch (Exception e) { - } - } - } - } - - public plasmaParserDocument transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) { - try { - return new plasmaParserDocument(new URL(urlNormalform(location)), - mimeType, null, null, scraper.getHeadline(), - null, null, - scraper.getText(), scraper.getAnchors(), scraper.getImages()); - } catch (MalformedURLException e) { - return null; - } - } - - /** - * This function is used to determine the parser class that should be used for a given - * mimetype ... 
- * @param mimeType - * @return - */ - public Parser getParser(String mimeType) { - - if (mimeType == null) { - // TODO: do automatic mimetype detection - return null; - } - - try { - if (this.parserList.containsKey(mimeType)) { - String parserClassName = (String)this.parserList.get(mimeType); - - // fetching a new parser object from pool - Parser theParser = (Parser) this.theParserPool.borrowObject(parserClassName); - - // checking if the created parser really supports the given mimetype - HashSet supportedMimeTypes = theParser.getSupportedMimeTypes(); - if ((supportedMimeTypes != null) && (supportedMimeTypes.contains(mimeType))) { - return theParser; - } - this.theParserPool.returnObject(parserClassName,theParser); - } - } catch (Exception e) { - System.err.println("ERROR: Unable to load the correct parser for type " + mimeType); - } - - return null; - - } - - public static String urlNormalform(URL url) { - if (url == null) return null; - return urlNormalform(url.toString()); - } - - public static String urlNormalform(String us) { - if (us == null) return null; - if (us.length() == 0) return null; - int p; - if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p); - if (us.endsWith(":80")) us = us.substring(0, us.length() - 3); - if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1); - return us; - } - - static Map allReflinks(Map links) { - // we find all links that are part of a reference inside a url - HashMap v = new HashMap(); - Iterator i = links.keySet().iterator(); - String s; - int pos; - loop: while (i.hasNext()) { - s = (String) i.next(); - if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) { - i.remove(); - s = s.substring(pos); - while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos); - if (!(v.containsKey(s))) v.put(s, "ref"); - continue loop; - } - if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) { - i.remove(); - s = "http:/" + s.substring(pos); - while ((pos = 
s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos); - if (!(v.containsKey(s))) v.put(s, "ref"); - continue loop; - } - } - return v; - } - - static Map allSubpaths(Map links) { - HashMap v = new HashMap(); - Iterator i = links.keySet().iterator(); - String s; - int pos; - while (i.hasNext()) { - s = (String) i.next(); - if (s.endsWith("/")) s = s.substring(0, s.length() - 1); - pos = s.lastIndexOf("/"); - while (pos > 8) { - s = s.substring(0, pos + 1); - if (!(v.containsKey(s))) v.put(s, "sub"); - s = s.substring(0, pos); - pos = s.lastIndexOf("/"); - } - } - return v; - } - - public static void main(String[] args) { - //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java - //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out - try { - File in = new File(args[0]); - File out = new File(args[1]); - plasmaParser theParser = new plasmaParser(new File("yacy.parser")); - FileInputStream theInput = new FileInputStream(in); - ByteArrayOutputStream theOutput = new ByteArrayOutputStream(); - serverFileUtils.copy(theInput, theOutput); - plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "text/html", theOutput.toByteArray()); - //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray()); - byte[] theText = document.getText(); - serverFileUtils.write(theText, out); - } catch (Exception e) { - e.printStackTrace(); - } - } - -} - -final class plasmaParserFactory implements KeyedPoolableObjectFactory { - - public plasmaParserFactory() { - super(); - } - - /** - * @see org.apache.commons.pool.PoolableObjectFactory#makeObject() - */ - public Object makeObject(Object key) throws Exception { - - if (!(key instanceof String)) - throw new IllegalArgumentException("The object key must be of type string."); - - Class 
moduleClass = Class.forName((String)key); - return moduleClass.newInstance(); - } - - /** - * @see org.apache.commons.pool.PoolableObjectFactory#destroyObject(java.lang.Object) - */ - public void destroyObject(Object key, Object obj) { - if (obj instanceof Parser) { - Parser theParser = (Parser) obj; - } - } - - /** - * @see org.apache.commons.pool.PoolableObjectFactory#validateObject(java.lang.Object) - */ - public boolean validateObject(Object key, Object obj) { - if (obj instanceof Parser) { - Parser theParser = (Parser) obj; - return true; - } - return true; - } - - /** - * @param obj - * - */ - public void activateObject(Object key, Object obj) { - //log.debug(" activateObject..."); - } - - /** - * @param obj - * - */ - public void passivateObject(Object key, Object obj) { - //log.debug(" passivateObject..." + obj); - if (obj instanceof Parser) { - Parser theParser = (Parser) obj; - theParser.reset(); - } - } -} - -final class plasmaParserPool extends GenericKeyedObjectPool { - - public plasmaParserPool(plasmaParserFactory objFactory, - GenericKeyedObjectPool.Config config) { - super(objFactory, config); - } - - - public Object borrowObject(Object key) throws Exception { - return super.borrowObject(key); - } - - public void returnObject(Object key, Object borrowed) throws Exception { - super.returnObject(key,borrowed); - } -} +// plasmaParser.java +// ------------------------ +// part of YaCy +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2005 +// +// last major change: 02.05.2005 by Martin Thelian +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
+ +// compile: javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java + + +package de.anomic.plasma; + +import java.io.*; +import java.net.*; +import java.util.*; + +import org.apache.commons.pool.KeyedPoolableObjectFactory; +import org.apache.commons.pool.impl.GenericKeyedObjectPool; +import org.apache.commons.pool.impl.GenericObjectPool; + +import de.anomic.plasma.parser.Parser; +import de.anomic.server.serverFileUtils; +import de.anomic.htmlFilter.*; + +public final class plasmaParser { + + /** + * A list containing all installed parsers and the mimeType that they support + * @see #loadAvailableParserList() + */ + private final Properties availableParserList = new Properties(); + + /** + * A list containing all enabled parsers and the mimeType that they can handle + * @see #loadEnabledParserList() + * @see #setEnabledParserList(Enumeration) + */ + private final Properties enabledParserList = new Properties(); + + /** + * A list of file extensions that are supported by all enabled parsers + */ + private static final HashSet supportedFileExt = new HashSet(); + + /** + * A pool of parsers. + * @see plasmaParserPool + * @see plasmaParserFactory + */ + private final plasmaParserPool theParserPool; + + /** + * The configuration file containing a list of enabled parsers + * @see plasmaParser#plasmaParser(File) + */ + private final File parserDispatcherPropertyFile; + + /** + * A list of media extensions that should not be handled by the plasmaParser + */ + private static final HashSet mediaExtSet = new HashSet(28); + + /** + * This {@link FilenameFilter} is used to find all classes based on there filenames + * which seems to be additional content parsers. 
+     * Currently the filenames of all content parser classes must end with Parser.class
+     */
+    private final FilenameFilter parserFileNameFilter = new FilenameFilter() {
+        public boolean accept(File dir, String name) {
+            return name.endsWith("Parser.class");
+        }
+    };
+
+    /**
+     * This {@link FileFilter} is used to get all subpackages
+     * of the parser package.
+     */
+    private final FileFilter parserDirectoryFilter = new FileFilter() {
+        public boolean accept(File file) {
+            return file.isDirectory();
+        }
+    };
+
+    /**
+     * Initializes the media extension set with a built-in default list.
+     * @see #initMediaExt(String)
+     */
+    static {
+        initMediaExt("swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar," +
+                "sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj");
+    }
+
+    /**
+     * Replaces the media extension set with the extensions of a comma separated list.
+     * @param mediaExtString comma separated file extensions, e.g. <code>"jpg,gif,png"</code>
+     */
+    public static void initMediaExt(String mediaExtString) {
+        String[] xs = mediaExtString.split(",");
+        initMediaExt(Arrays.asList(xs));
+    }
+
+    /**
+     * Replaces the media extension set with the given extensions.
+     * @param mediaExtList the new file extensions
+     */
+    public static void initMediaExt(List mediaExtList) {
+        synchronized (mediaExtSet) {
+            mediaExtSet.clear();
+            mediaExtSet.addAll(mediaExtList);
+        }
+    }
+
+    /**
+     * Tests if a file extension denotes a media file.
+     * An extension contained in the set of extensions supported by the enabled
+     * parsers is never reported as media extension, even if it is also part of
+     * the media extension set.
+     * @param mediaExt the file extension to test
+     * @return <code>true</code> if the extension is contained in the media extension set
+     * and not in the set of supported file extensions
+     */
+    public static boolean mediaExtContains(String mediaExt) {
+
+        synchronized (supportedFileExt) {
+            if (supportedFileExt.contains(mediaExt)) return false;
+        }
+
+        synchronized (mediaExtSet) {
+            // look up the extension itself (the set was tested against itself before,
+            // which could never match)
+            return mediaExtSet.contains(mediaExt);
+        }
+    }
+
+
+    /**
+     * @param parserDispatcherPropertyFile the configuration file containing a list of enabled parsers
+     */
+    public plasmaParser(File parserDispatcherPropertyFile) {
+
+        this.parserDispatcherPropertyFile = parserDispatcherPropertyFile;
+
+        /* ===================================================
+         * initializing the parser object pool
+         * =================================================== */
+        GenericKeyedObjectPool.Config config = new GenericKeyedObjectPool.Config();
+
+        // The maximum number of active parser objects that can be allocated from pool at the same time,
+        // 0 for no limit
+        config.maxActive = 0;
+
+        // The maximum number of idle parser objects in the pool
+        // 0 = no limit.
+        config.maxIdle = 10;
+
+        config.whenExhaustedAction = GenericObjectPool.WHEN_EXHAUSTED_BLOCK;
+        config.minEvictableIdleTimeMillis = 30000;
+
+        this.theParserPool = new plasmaParserPool(new plasmaParserFactory(),config);
+
+        /* ===================================================
+         * loading a list of available parsers
+         * =================================================== */
+        loadAvailableParserList();
+
+        /* ===================================================
+         * loading a list of activated parsers
+         * =================================================== */
+        loadEnabledParserList();
+
+    }
+
+    /**
+     * Replaces the list of enabled parsers and the set of file extensions
+     * that are supported by the enabled parsers.
+     * Mimetypes that are not contained in the list of available parsers, or whose
+     * parser could not be borrowed from the pool, are skipped.
+     * @param mimeTypes the mimetypes (as {@link String}s) whose parsers should be enabled;
+     * <code>null</code> disables all parsers
+     * @return always <code>true</code>
+     */
+    public boolean setEnabledParserList(Enumeration mimeTypes) {
+
+        Properties newEnabledParsers = new Properties();
+        HashSet newSupportedFileExt = new HashSet();
+
+        if (mimeTypes != null) {
+            while (mimeTypes.hasMoreElements()) {
+                String mimeType = (String) mimeTypes.nextElement();
+                if (this.availableParserList.containsKey(mimeType)) {
+                    // the pool is keyed by the parser class name, not by the mimetype
+                    Object parserClassName = this.availableParserList.get(mimeType);
+                    Parser theParser = null;
+                    try {
+                        // getting the parser
+                        theParser = (Parser) this.theParserPool.borrowObject(parserClassName);
+
+                        // getting a list of mimeTypes that the parser supports
+                        Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
+                        if (parserSupportsMimeTypes != null) {
+                            // the value stored for a mimetype is a comma separated list of file extensions
+                            Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
+                            if ((supportedExtensions != null) && (supportedExtensions instanceof String)) {
+                                String[] extArray = ((String)supportedExtensions).split(",");
+                                newSupportedFileExt.addAll(Arrays.asList(extArray));
+                            }
+                        }
+                        newEnabledParsers.put(mimeType,parserClassName);
+
+                    } catch (Exception e) {
+                        e.printStackTrace();
+                    } finally {
+                        if (theParser != null) {
+                            // the parser must be returned with the same key it was borrowed with
+                            try { this.theParserPool.returnObject(parserClassName,theParser); } catch (Exception e) { /* best effort, nothing more to do */ }
+                        }
+                    }
+                }
+            }
+        }
+
+        synchronized (this.enabledParserList) {
+            this.enabledParserList.clear();
+            this.enabledParserList.putAll(newEnabledParsers);
+        }
+
+
+        synchronized (supportedFileExt) {
+            supportedFileExt.clear();
+            // publish the newly collected extensions (the cleared set was added to itself before)
+            supportedFileExt.addAll(newSupportedFileExt);
+        }
+
+        return true;
+    }
+
+    /**
+     * @return a copy of the list of enabled parsers (mimetype to parser class name)
+     */
+    public Hashtable getEnabledParserList() {
+        synchronized (this.enabledParserList) {
+            return (Hashtable) this.enabledParserList.clone();
+        }
+    }
+
+    /**
+     * @return a copy of the list of available parsers (mimetype to parser class name),
+     * so that callers can not modify the internal list
+     */
+    public Hashtable getAvailableParserList() {
+        synchronized (this.availableParserList) {
+            return (Hashtable) this.availableParserList.clone();
+        }
+    }
+
+    /**
+     * Reads the parser configuration file and enables all parsers whose
+     * mimetypes are listed there. If the file can not be read, no parser is enabled.
+     */
+    private void loadEnabledParserList() {
+        // loading the list of enabled parsers from file
+        Properties prop = new Properties();
+        FileInputStream propStream = null;
+        try {
+            propStream = new FileInputStream(this.parserDispatcherPropertyFile);
+            prop.load(propStream);
+        } catch (IOException e) {
+            System.err.println("ERROR: " + this.parserDispatcherPropertyFile.toString() + " not found in settings path");
+        } finally {
+            // release the file handle in any case
+            if (propStream != null) try { propStream.close(); } catch (IOException e) { /* nothing more to do */ }
+        }
+
+        // enable them ...
+        this.setEnabledParserList(prop.keys());
+    }
+
+    /**
+     * Clears the list of available parsers and searches the subdirectories of the
+     * <code>parser</code> subpackage for classes whose filename ends with Parser.class
+     */
+    private void loadAvailableParserList() {
+        try {
+            this.availableParserList.clear();
+
+            // getting the current package name
+            String plasmaParserPkgName = this.getClass().getPackage().getName() + ".parser";
+            System.out.println("INFO: Searching for additional content parsers in package " + plasmaParserPkgName);
+
+            // getting an uri to the parser subpackage
+            String packageURI = this.getClass().getResource("/"+plasmaParserPkgName.replace('.','/')).toString();
+            System.out.println("INFO: Parser directory is " + packageURI);
+
+            // open the parser directory
+            File parserDir = new File(new URI(packageURI));
+            if ((parserDir == null) || (!parserDir.exists()) || (!parserDir.isDirectory())) return;
+
+            /*
+             * loop through all subdirectories and test if we can
+             * find an additional parser class
+             */
+            File[] parserDirectories = parserDir.listFiles(this.parserDirectoryFilter);
+            if (parserDirectories == null) return;
+            for (int parserDirNr=0; parserDirNr< parserDirectories.length; parserDirNr++) {
+                File currentDir = parserDirectories[parserDirNr];
+                System.out.println("INFO: Searching in directory " + currentDir.toString());
+                String[] parserClasses = currentDir.list(this.parserFileNameFilter);
+                if (parserClasses == null)
continue; + + for (int parserNr=0; parserNr= 0) us = us.substring(0, p); + if (us.endsWith(":80")) us = us.substring(0, us.length() - 3); + if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1); + return us; + } + + static Map allReflinks(Map links) { + // we find all links that are part of a reference inside a url + HashMap v = new HashMap(); + Iterator i = links.keySet().iterator(); + String s; + int pos; + loop: while (i.hasNext()) { + s = (String) i.next(); + if ((pos = s.toLowerCase().indexOf("http://",7)) > 0) { + i.remove(); + s = s.substring(pos); + while ((pos = s.toLowerCase().indexOf("http://",7)) > 0) s = s.substring(pos); + if (!(v.containsKey(s))) v.put(s, "ref"); + continue loop; + } + if ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) { + i.remove(); + s = "http:/" + s.substring(pos); + while ((pos = s.toLowerCase().indexOf("/www.",7)) > 0) s = "http:/" + s.substring(pos); + if (!(v.containsKey(s))) v.put(s, "ref"); + continue loop; + } + } + return v; + } + + static Map allSubpaths(Map links) { + HashMap v = new HashMap(); + Iterator i = links.keySet().iterator(); + String s; + int pos; + while (i.hasNext()) { + s = (String) i.next(); + if (s.endsWith("/")) s = s.substring(0, s.length() - 1); + pos = s.lastIndexOf("/"); + while (pos > 8) { + s = s.substring(0, pos + 1); + if (!(v.containsKey(s))) v.put(s, "sub"); + s = s.substring(0, pos); + pos = s.lastIndexOf("/"); + } + } + return v; + } + + public static void main(String[] args) { + //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java + //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out + try { + File in = new File(args[0]); + File out = new File(args[1]); + plasmaParser theParser = new plasmaParser(new File("yacy.parser")); + FileInputStream theInput = new FileInputStream(in); + 
ByteArrayOutputStream theOutput = new ByteArrayOutputStream();
+            serverFileUtils.copy(theInput, theOutput);
+            plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "text/html", theOutput.toByteArray());
+            //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray());
+            byte[] theText = document.getText();
+            serverFileUtils.write(theText, out);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+}
+
+/**
+ * Factory that creates the parser objects kept in the {@link plasmaParserPool}.
+ * The pool key is the fully qualified class name of the parser.
+ */
+final class plasmaParserFactory implements KeyedPoolableObjectFactory {
+
+    public plasmaParserFactory() {
+        super();
+    }
+
+    /**
+     * Creates a new parser object by instantiating the class named by the key.
+     * @param key the fully qualified class name of the parser
+     * @return a new instance of the class named by <code>key</code>
+     * @throws IllegalArgumentException if the key is not a {@link String}
+     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#makeObject(java.lang.Object)
+     */
+    public Object makeObject(Object key) throws Exception {
+
+        if (!(key instanceof String))
+            throw new IllegalArgumentException("The object key must be of type string.");
+
+        Class moduleClass = Class.forName((String)key);
+        return moduleClass.newInstance();
+    }
+
+    /**
+     * Does nothing, no cleanup is done for a parser that is dropped from the pool.
+     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#destroyObject(java.lang.Object, java.lang.Object)
+     */
+    public void destroyObject(Object key, Object obj) {
+        // intentionally empty
+    }
+
+    /**
+     * Accepts every object, no validation is done.
+     * @return always <code>true</code>
+     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#validateObject(java.lang.Object, java.lang.Object)
+     */
+    public boolean validateObject(Object key, Object obj) {
+        return true;
+    }
+
+    /**
+     * Does nothing, a parser needs no preparation before it is borrowed.
+     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#activateObject(java.lang.Object, java.lang.Object)
+     */
+    public void activateObject(Object key, Object obj) {
+        // intentionally empty
+    }
+
+    /**
+     * Resets a parser when it is returned to the pool.
+     * @see org.apache.commons.pool.KeyedPoolableObjectFactory#passivateObject(java.lang.Object, java.lang.Object)
+     */
+    public void passivateObject(Object key, Object obj) {
+        if (obj instanceof Parser) {
+            Parser theParser = (Parser) obj;
+            theParser.reset();
+        }
+    }
+}
+
+/**
+ * A keyed object pool holding parser objects, keyed by the parser class name.
+ */
+final class plasmaParserPool extends GenericKeyedObjectPool {
+
+    public plasmaParserPool(plasmaParserFactory objFactory,
+                            GenericKeyedObjectPool.Config config) {
+        super(objFactory, config);
+    }
+
+
+    public Object borrowObject(Object key) throws Exception  {
+       return super.borrowObject(key);
+    }
+
+    public void returnObject(Object key, Object borrowed) throws Exception  {
+        super.returnObject(key,borrowed);
+    }
+}
+
+
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index 26692f0ee..c4f895f36 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -148,7 +148,7 @@ public class plasmaParserDocument {
         Iterator i;
         String url;
         int extpos;
-        String ext;
+        String ext = null;
         i = anchors.entrySet().iterator();
         hyperlinks = new HashMap();
         medialinks = new HashMap();
@@ -163,10 +163,14 @@ public class plasmaParserDocument {
             extpos = url.lastIndexOf(".");
             String normal;
             if (extpos > 0) {
-                ext = url.substring(extpos).toLowerCase();
+                if (url.indexOf("?") > extpos) { // a '?' in front of the last dot would give substring() an end before its start
+                    ext = url.substring(extpos,url.indexOf("?")).toLowerCase();
+                } else {
+                    ext = url.substring(extpos).toLowerCase();
+                }
                 normal = plasmaParser.urlNormalform(url);
                 if (normal != null) {
-                    if (plasmaParser.mediaExtSet.contains(ext.substring(1))) {
+                    if (plasmaParser.mediaExtContains(ext.substring(1))) {
                         // this is not an normal anchor, its a media link
                         medialinks.put(normal, entry.getValue());
                     } else {
diff --git a/source/de/anomic/server/serverLog.java b/source/de/anomic/server/serverLog.java
index 8cbcceaf4..9489c0032 100644
--- a/source/de/anomic/server/serverLog.java
+++ b/source/de/anomic/server/serverLog.java
@@ -76,7 +76,7 @@ public final class serverLog {
     // statics
     private static serverLog genericLog = new serverLog("GENERIC", LOGLEVEL_DEBUG); // generic log
-    private static LinkedList lastLog = new LinkedList(); // for Web-Interface
+    private
static LinkedList lastLog = new LinkedList(); // for Web-Interface + private static LinkedList lastLog = new LinkedList(); // for Web-Interface private static int lastlogMaxSize = 400; // for Web-Interface // class variables diff --git a/startYACY.command b/startYACY.command index d63a3866c..a6f58060c 100755 --- a/startYACY.command +++ b/startYACY.command @@ -1,2 +1,8 @@ cd `dirname $0` -java -classpath classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar:libx/PDFBox-0.7.1.jar:libx/log4j-1.2.9.jar:libx/tm-extractors-0.4.jar -server yacy + +# generating the proper classpath +CLASSPATH="" +for N in `ls -1 lib/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done +for N in `ls -1 libx/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done + +java -classpath classes:$CLASSPATH -server yacy diff --git a/startYACY.sh b/startYACY.sh index 62d9ba3fa..0621b575d 100755 --- a/startYACY.sh +++ b/startYACY.sh @@ -6,13 +6,19 @@ then echo else cd `dirname $0` + + # generating the proper classpath + CLASSPATH="" + for N in `ls -1 lib/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done + for N in `ls -1 libx/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done + if [ x$1 != x-d ] then - nohup java -classpath classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar:libx/PDFBox-0.7.1.jar:libx/log4j-1.2.9.jar:libx/tm-extractors-0.4.jar yacy >> yacy.log & + nohup java -classpath classes:$CLASSPATH yacy >> yacy.log & echo "YaCy started as daemon process. 
View it's activity in yacy.log" echo "To stop YaCy, please execute stopYACY.sh and wait some seconds" echo "To administrate YaCy, start your web browser and open http://localhost:8080" else - java -classpath classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar yacy + java -classpath classes:$CLASSPATH yacy fi fi diff --git a/stopYACY.command b/stopYACY.command index e99e12af1..7b19fa202 100755 --- a/stopYACY.command +++ b/stopYACY.command @@ -1,2 +1,8 @@ cd `dirname $0` -java -classpath classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar:libx/PDFBox-0.7.1.jar:libx/log4j-1.2.9.jar:libx/tm-extractors-0.4.jar yacy -shutdown + +# generating the proper classpath +CLASSPATH="" +for N in `ls -1 lib/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done +for N in `ls -1 libx/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done + +java -classpath classes:$CLASSPATH yacy -shutdown diff --git a/stopYACY.sh b/stopYACY.sh index b20c3519a..7fafb75fe 100755 --- a/stopYACY.sh +++ b/stopYACY.sh @@ -1,5 +1,11 @@ #!/bin/sh cd `dirname $0` -java -classpath classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar:libx/PDFBox-0.7.1.jar:libx/log4j-1.2.9.jar:libx/tm-extractors-0.4.jar yacy -shutdown + +# generating the proper classpath +CLASSPATH="" +for N in `ls -1 lib/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done +for N in `ls -1 libx/*.jar`; do CLASSPATH="$CLASSPATH$N:"; done + +java -classpath classes:$CLASSPATH yacy -shutdown echo "please wait until the YaCy daemon process terminates" echo "you can monitor this with 'tail -f yacy.log' and 'fuser yacy.log'" \ No newline at end of file diff --git a/yacy.init b/yacy.init index 8605f8ae7..c9815dd08 100644 --- a/yacy.init +++ b/yacy.init @@ -10,7 +10,7 @@ # the http service configurations # port number of server -port = 8080 +port = 8090 # time-out of client control socket in milliseconds # since this applies only to the client-proxy connection, @@ -79,13 +79,13 @@ proxyCache = DATA/HTCACHE proxyCacheSize = 200 # the following mime-types are 
the whitelist for indexing -parseableMime=application/xhtml+xml,text/html,text/plain +parseableMime=application/xhtml+xml,text/html,text/plain,application/pdf,application/msword # media extension string # a comma-separated list of extensions that denote media file formats # this is important to recognize - tags as not-html reference # These files will be excluded from indexing -mediaExt=swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar,sit,hqx,img,dmg,tar,gz,ps,pdf,doc,xls,ppt,ram,bz2,arj,jar,deb,torrent,ogg,iso,bin,ace,tgz,rpm,css +mediaExt=swf,wmv,jpg,jpeg,jpe,rm,mov,mpg,mpeg,mp3,asf,gif,png,avi,zip,rar,sit,hqx,img,dmg,tar,gz,ps,xls,ppt,ram,bz2,arj,jar,deb,torrent,ogg,iso,bin,ace,tgz,rpm,css # Promotion Strings # These strings appear in the Web Mask of the YACY search client diff --git a/yacy.parser b/yacy.parser index eea29b5ce..1ca222fde 100644 --- a/yacy.parser +++ b/yacy.parser @@ -1,2 +1,8 @@ +#plasmaParser configuration file +#Mon May 02 10:12:02 CEST 2005 +application/atom+xml=de.anomic.plasma.parser.rss.rssParser +text/rss=de.anomic.plasma.parser.rss.rssParser +application/rss+xml=de.anomic.plasma.parser.rss.rssParser +application/rdf+xml=de.anomic.plasma.parser.rss.rssParser +application/msword=de.anomic.plasma.parser.doc.docParser application/pdf=de.anomic.plasma.parser.pdf.pdfParser -application/msword=de.anomic.plasma.parser.doc.docParser \ No newline at end of file