Refactoring of YaCy documents and parsers: they now depend only on the kelondro classes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6426 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 0fd9540866
commit b79f4f062f

@ -36,6 +36,7 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
@ -46,7 +47,6 @@ import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
import de.anomic.data.userDB;
import de.anomic.data.bookmarksDB.Tag;
import de.anomic.document.Document;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;

@ -25,8 +25,8 @@
// javac -classpath .:../Classes Settings_p.java
// if the shell's current path is HTROOT
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import net.yacy.document.Idiom;
import net.yacy.document.Parser;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;

@ -24,11 +24,11 @@
import java.io.File;
import net.yacy.document.content.dao.Dao;
import net.yacy.document.content.dao.ImportDump;
import net.yacy.document.content.dao.PhpBB3Dao;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.content.dao.Dao;
import de.anomic.content.dao.ImportDump;
import de.anomic.content.dao.PhpBB3Dao;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;

@ -25,11 +25,11 @@
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.document.parser.xml.RSSReader;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.content.RSSMessage;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.document.parser.xml.RSSReader;
import de.anomic.http.server.RequestHeader;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;

@ -36,6 +36,7 @@ import java.util.Iterator;
import java.util.List;
import java.util.Set;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -50,7 +51,6 @@ import net.yacy.kelondro.util.DateFormatter;
import de.anomic.data.AbstractBlacklist;
import de.anomic.data.Blacklist;
import de.anomic.data.listManager;
import de.anomic.document.Condenser;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.QueryParams;
import de.anomic.search.RankingProcess;

@ -37,10 +37,10 @@ import java.util.Map.Entry;
import java.util.ArrayList;
import java.util.HashMap;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;

@ -35,16 +35,17 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
import de.anomic.document.Condenser;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.document.parser.html.ImageEntry;
import de.anomic.http.client.Client;
import de.anomic.http.client.Cache;
import de.anomic.http.server.RequestHeader;
@ -266,7 +267,7 @@ public class ViewFile {
// parsing the resource content
Document document = null;
try {
document = Document.parseDocument(url, resourceLength, resource);
document = LoaderDispatcher.parseDocument(url, resourceLength, resource);
if (document == null) {
prop.put("error", "5");
prop.put("error_errorText", "Unknown error");

@ -36,6 +36,8 @@ import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
@ -45,8 +47,6 @@ import de.anomic.crawler.ZURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.TransformerWriter;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segment;
import de.anomic.search.Segments;

@ -3,11 +3,11 @@
import java.util.Date;
import java.util.Iterator;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.data.bookmarksDB;
import de.anomic.data.userDB;
import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;

@ -3,10 +3,10 @@
import java.util.Date;
import java.util.Iterator;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.data.bookmarksDB;
import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;

@ -2,8 +2,9 @@
import java.util.Date;
import de.anomic.content.RSSMessage;
import de.anomic.document.parser.xml.RSSFeed;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;

@ -3,10 +3,11 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Set;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.crawler.CrawlProfile;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
@ -49,11 +50,11 @@ public class getpageinfo_p {
}
ContentScraper scraper = null;
if (u != null) try {
scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
} catch (final IOException e) {
// try again, try harder
try {
scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST);
scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST);
} catch (final IOException ee) {
// now thats a fail, do nothing
}

@ -30,10 +30,10 @@ import java.io.File;
import java.io.IOException;
import java.util.Date;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;

@ -30,12 +30,12 @@ import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.content.RSSMessage;
import de.anomic.crawler.retrieval.Request;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;

@ -44,7 +44,6 @@ import de.anomic.data.AbstractBlacklist;
import de.anomic.data.listManager;
import de.anomic.data.list.ListAccumulator;
import de.anomic.data.list.XMLBlacklistImporter;
import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
@ -52,6 +51,7 @@ import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacySeed;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;

@ -34,6 +34,8 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.order.Base64Order;
@ -42,8 +44,6 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.SortStack;
import net.yacy.kelondro.util.ISO639;
import de.anomic.content.RSSMessage;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.net.natLib;

@ -32,13 +32,13 @@ import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.content.RSSMessage;
import de.anomic.data.Blacklist;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;

@ -29,12 +29,12 @@
import java.io.IOException;
import java.text.ParseException;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.content.RSSMessage;
import de.anomic.data.Blacklist;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;

@ -33,6 +33,10 @@ import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -44,14 +48,10 @@ import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.SetTools;
import net.yacy.kelondro.util.ISO639;
import de.anomic.content.RSSMessage;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.data.DidYouMean;
import de.anomic.data.LibraryProvider;
import de.anomic.data.Location;
import de.anomic.document.Condenser;
import de.anomic.document.Document;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.QueryParams;

@ -36,6 +36,8 @@ import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.table.SplitTable;
@ -43,10 +45,8 @@ import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.workflow.WorkflowJob;
import de.anomic.content.RSSMessage;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.client.Client;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;

@ -24,17 +24,16 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.content.oai;
package de.anomic.crawler;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.file.SurrogateReader;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.content.DCEntry;
import de.anomic.content.file.SurrogateReader;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Request;

@ -30,10 +30,10 @@ import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.document.Document;
import de.anomic.document.parser.html.ImageEntry;
public class ResultImages {

@ -32,12 +32,12 @@ import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.crawler.Latency;
import de.anomic.document.Parser;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseHeader;

@ -28,12 +28,12 @@ package de.anomic.crawler.retrieval;
import java.io.IOException;
import java.util.Date;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.Latency;
import de.anomic.data.Blacklist;
import de.anomic.document.Parser;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;

@ -29,6 +29,7 @@ package de.anomic.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
@ -36,13 +37,17 @@ import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.CrawlProfile;
import de.anomic.document.Document;
import de.anomic.document.ParserException;
import de.anomic.http.client.Cache;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseHeader;
@ -352,7 +357,7 @@ public final class LoaderDispatcher {
// parse resource
Document document = null;
try {
document = Document.parseDocument(url, resContentLength, resContent, responseHeader);
document = parseDocument(url, resContentLength, resContent, responseHeader);
} catch (final ParserException e) {
Log.logFine("snippet fetch", "parser error " + e.getMessage() + " for url " + url);
return null;
@ -362,6 +367,75 @@ public final class LoaderDispatcher {
return document;
}
/**
 * Parses a resource into a {@link Document}.
 * <p>
 * If no response header is passed in, the method first asks the cache for one and,
 * for URLs whose protocol starts with "http", falls back to <code>Client.whead</code>.
 * Failures of both lookups are tolerated; in that case the parser is invoked
 * without a mime type and without a character encoding.
 *
 * @param url the URL of the resource
 * @param contentLength the content length of the resource
 * @param resourceStream the resource body as stream; if <code>null</code> nothing is parsed
 * @param responseHeader metadata about the resource, may be <code>null</code>
 * @return the extracted data, or <code>null</code> if there is no resource stream,
 *         if <code>Parser.supports</code> reports an error for the resource,
 *         or if the thread was interrupted while parsing
 * @throws ParserException propagated from the parser
 */
public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream, ResponseHeader responseHeader) throws ParserException {
    try {
        if (resourceStream == null) return null;

        // STEP 1: if no resource metadata is available, try to load it from cache
        if (responseHeader == null) {
            // try to get the header from the htcache directory
            try {
                responseHeader = Cache.getResponseHeader(url);
            } catch (final Exception e) {
                // ignore this. resource info loading failed
            }
        }

        // STEP 2: if the metadata is still null try to download it from web
        if ((responseHeader == null) && (url.getProtocol().startsWith("http"))) {
            // TODO: we need a better solution here
            // e.g. encapsulate this in the crawlLoader class
            // getting URL mimeType
            try {
                responseHeader = Client.whead(url.toString());
            } catch (final Exception e) {
                // ignore this. http header download failed
            }
        }

        // STEP 3: if the metadata is still null the parser has to guess the mimeType of the resource
        final String mimeType = (responseHeader == null) ? null : responseHeader.mime();
        final String supportError = Parser.supports(url, mimeType);
        if (supportError != null) {
            // the parser reported that it cannot handle this resource
            return null;
        }
        final String charset = (responseHeader == null) ? null : responseHeader.getCharacterEncoding();
        return Parser.parseSource(url, mimeType, charset, contentLength, resourceStream);
    } catch (final InterruptedException e) {
        // interruption of thread detected: restore the interrupt status so that
        // callers further up the stack can still observe the interruption
        Thread.currentThread().interrupt();
        return null;
    }
}
/**
 * Convenience overload of
 * {@link #parseDocument(DigestURI, long, InputStream, ResponseHeader)}
 * for callers that have no response header at hand.
 *
 * @param url the URL of the resource
 * @param contentLength the content length of the resource
 * @param resourceStream the resource body as stream
 * @return whatever the four-argument variant returns for a <code>null</code> header
 * @throws ParserException propagated from the four-argument variant
 */
public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream) throws ParserException {
    // no metadata known here; the delegate tries to look it up itself
    final ResponseHeader unknownHeader = null;
    return parseDocument(url, contentLength, resourceStream, unknownHeader);
}
/**
 * Loads a page through the given loader and feeds its content into a
 * {@link ContentScraper}.
 *
 * @param loader the loader used to fetch the page
 * @param location the URL of the page
 * @param cachePolicy the cache strategy handed to the loader
 * @return the scraper after the page content has been written to it
 * @throws IOException if the loader returns no response or the response has no content,
 *         or if writing the content to the scraper fails
 */
public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException {
    // fetch the page
    final Response response = loader.load(location, true, false, cachePolicy);
    byte[] content = null;
    if (response != null) {
        content = response.getContent();
    }
    if (content == null) {
        throw new IOException("no response from url " + location.toString());
    }

    // push the page through a transformer writer that has only the scraper attached
    // NOTE(review): the content is always decoded as UTF-8, independent of the
    // charset of the response - confirm that this is intended
    final String text = new String(content, "UTF-8");
    final ContentScraper scraper = new ContentScraper(location);
    final Writer scraperWriter = new TransformerWriter(null, null, scraper, null, false);
    // NOTE(review): the writer is neither flushed nor closed here - verify that
    // TransformerWriter hands all content to the scraper without a close()
    scraperWriter.write(text);
    return scraper;
}
public synchronized void cleanupAccessTimeTable(long timeout) {
final Iterator<Map.Entry<String, Long>> i = accessTime.entrySet().iterator();

@ -28,12 +28,12 @@ package de.anomic.crawler.retrieval;
import java.util.Date;
import net.yacy.document.Classification;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.crawler.CrawlProfile;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseHeader;

@ -55,6 +55,8 @@ import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.blob.Heap;
import net.yacy.kelondro.blob.MapView;
import net.yacy.kelondro.data.meta.DigestURI;
@ -76,8 +78,6 @@ import org.xml.sax.SAXException;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Request;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.TransformerWriter;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.yacy.yacyNewsPool;

@ -30,7 +30,8 @@ package de.anomic.data;
import java.util.ArrayList;
import de.anomic.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.CharacterCoding;
/**
* This class provides a diff-functionality.

@ -32,7 +32,8 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import de.anomic.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.CharacterCoding;
import de.anomic.server.serverCore;
/** This class provides methods to handle texts that have been posted in the yacyWiki or other

@ -81,6 +81,10 @@ import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import java.util.zip.GZIPOutputStream;
import net.yacy.document.Classification;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
@ -88,10 +92,6 @@ import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import de.anomic.document.Classification;
import de.anomic.document.parser.htmlParser;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.ScraperInputStream;
import de.anomic.http.server.servlets.transferURL;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;

@ -71,6 +71,9 @@ import java.util.logging.LogManager;
import java.util.logging.Logger;
import java.util.zip.GZIPOutputStream;
import net.yacy.document.Parser;
import net.yacy.document.parser.html.ContentTransformer;
import net.yacy.document.parser.html.Transformer;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.ByteCountOutputStream;
import net.yacy.kelondro.logging.Log;
@ -82,9 +85,6 @@ import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
import de.anomic.document.Parser;
import de.anomic.document.parser.html.ContentTransformer;
import de.anomic.document.parser.html.Transformer;
import de.anomic.http.client.MultiOutputStream;
import de.anomic.http.client.Client;
import de.anomic.http.client.RemoteProxyConfig;

@ -50,6 +50,7 @@ import java.util.StringTokenizer;
import java.util.Map.Entry;
import java.util.zip.GZIPInputStream;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
@ -72,7 +73,6 @@ import org.apache.commons.httpclient.ChunkedInputStream;
import org.apache.commons.httpclient.ContentLengthInputStream;
import de.anomic.data.userDB;
import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.search.Switchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverHandler;

@ -56,10 +56,10 @@ import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PushbackInputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import net.yacy.document.parser.html.ContentTransformer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.FileUtils;
@ -122,83 +122,6 @@ import net.yacy.kelondro.util.FileUtils;
*/
public final class TemplateEngine {
// '#' introduces and terminates every template pattern
public final static byte hash = (byte)'#';
// "::" separator (used between the parts of an alternative pattern)
public final static byte[] dpdpa = "::".getBytes();
// "#[" ... "]#" : normal pattern
private final static byte lbr = (byte)'[';
private final static byte rbr = (byte)']';
private final static byte[] pOpen = {hash, lbr};
private final static byte[] pClose = {rbr, hash};
// "#{" ... "}#" : multi pattern
private final static byte lcbr = (byte)'{';
private final static byte rcbr = (byte)'}';
private final static byte[] mOpen = {hash, lcbr};
private final static byte[] mClose = {rcbr, hash};
// "#(" ... ")#" : alternative pattern
private final static byte lrbr = (byte)'(';
private final static byte rrbr = (byte)')';
private final static byte[] aOpen = {hash, lrbr};
private final static byte[] aClose = {rrbr, hash};
// "#%" ... "%#" : include pattern
private final static byte ps = (byte)'%';
private final static byte[] iOpen = {hash, ps};
private final static byte[] iClose = {ps, hash};
// "/" marks the closing tag of a multi or alternative pattern
private final static byte[] slash = {(byte)'/'};
// all {open, close} delimiter pairs; splitQuotation tries them in this order,
// one pair per recursion level
private final static Object[] meta_quotation = new Object[] {
new Object[] {pOpen, pClose},
new Object[] {mOpen, mClose},
new Object[] {aOpen, aClose},
new Object[] {iOpen, iClose}
};
/**
 * Splits a text into its template patterns, "::" separators and the plain
 * text in between.
 *
 * @param text the text to split
 * @return the parts of the text in their original order
 */
public final static ByteBuffer[] splitQuotations(final ByteBuffer text) {
    final List<ByteBuffer> parts = splitQuotation(text, 0);
    return parts.toArray(new ByteBuffer[parts.size()]);
}
/**
 * Recursive worker for {@link #splitQuotations(ByteBuffer)}.
 * Each recursion level handles one delimiter pair of <code>meta_quotation</code>;
 * text outside of the patterns found on this level is handed to the next level.
 *
 * @param text the text to split
 * @param qoff index into <code>meta_quotation</code> of the delimiter pair to look for
 * @return the parts of the text in their original order
 */
private final static List<ByteBuffer> splitQuotation(ByteBuffer text, int qoff) {
    final ArrayList<ByteBuffer> parts = new ArrayList<ByteBuffer>();

    // every delimiter pair has been tried already: the text is one plain part
    if (qoff >= meta_quotation.length) {
        if (text.length() > 0) parts.add(text);
        return parts;
    }

    final Object[] delimiters = (Object[]) meta_quotation[qoff];
    final byte[] open = (byte[]) delimiters[0];
    final byte[] close = (byte[]) delimiters[1];
    final int nextLevel = qoff + 1;

    int start = -1;
    while (text.length() > 0) {
        start = text.indexOf(open);
        if (start < 0) break;
        final int end = text.indexOf(close, start + 1);

        // whatever precedes the opening delimiter belongs to the next level
        parts.addAll(splitQuotation(new ByteBuffer(text.getBytes(0, start)), nextLevel));
        if (end < 0) {
            // found only pattern start, no closing parenthesis (a syntax error that is silently accepted here)
            parts.addAll(splitQuotation(new ByteBuffer(text.getBytes(start)), nextLevel));
            text.clear();
        } else {
            // found a pattern: keep it in one piece, delimiters included, and go on behind it
            final int behind = end + close.length;
            parts.add(new ByteBuffer(text.getBytes(start, behind - start)));
            text = new ByteBuffer(text.getBytes(behind));
        }
    }

    // find double-points
    while (text.length() > 0) {
        start = text.indexOf(dpdpa);
        if (start < 0) break;
        parts.addAll(splitQuotation(new ByteBuffer(text.getBytes(0, start)), nextLevel));
        parts.add(new ByteBuffer(dpdpa));
        parts.addAll(splitQuotation(new ByteBuffer(text.getBytes(start + 2)), nextLevel));
        text.clear();
    }

    // add remaining
    if (text.length() > 0) parts.addAll(splitQuotation(text, nextLevel));
    return parts;
}
/**
* transfer until a specified pattern is found; everything but the pattern is transfered so far
* the function returns true, if the pattern is found
@ -254,13 +177,13 @@ public final class TemplateEngine {
byte[] replacement;
int bb;
final ByteBuffer structure = new ByteBuffer();
while (transferUntil(pis, out, hash)) {
while (transferUntil(pis, out, ContentTransformer.hashChar)) {
bb = pis.read();
keyStream.reset();
// #{
if ((bb & 0xFF) == lcbr) { //multi
if (transferUntil(pis, keyStream, mClose)) { //close tag
if ((bb & 0xFF) == ContentTransformer.lcbr) { //multi
if (transferUntil(pis, keyStream, ContentTransformer.mClose)) { //close tag
//multi_key = "_" + keyStream.toString(); //for _Key
bb = pis.read();
if ((bb & 0xFF) != 10){ //kill newline
@ -270,7 +193,7 @@ public final class TemplateEngine {
keyStream.reset(); //reset stream
//this needs multi_key without prefix
if (transferUntil(pis, keyStream, appendBytes(mOpen,slash,multi_key,mClose))){
if (transferUntil(pis, keyStream, appendBytes(ContentTransformer.mOpen, ContentTransformer.slashChar, multi_key, ContentTransformer.mClose))){
bb = pis.read();
if((bb & 0xFF) != 10){ //kill newline
pis.unread(bb);
@ -305,11 +228,11 @@ public final class TemplateEngine {
}
// #(
} else if ((bb & 0xFF) == lrbr) { //alternative
} else if ((bb & 0xFF) == ContentTransformer.lrbr) { //alternative
int others=0;
final ByteBuffer text= new ByteBuffer();
transferUntil(pis, keyStream, aClose);
transferUntil(pis, keyStream, ContentTransformer.aClose);
key = keyStream.toByteArray(); //Caution: Key does not contain prefix
keyStream.reset(); //clear
@ -341,7 +264,7 @@ public final class TemplateEngine {
return structure.getBytes();
}
keyStream.reset();
transferUntil(pis, keyStream, dpdpa);
transferUntil(pis, keyStream, ContentTransformer.dpdpa);
pis2 = new PushbackInputStream(new ByteArrayInputStream(keyStream.toByteArray()));
structure.append(writeTemplate(pis2, out, pattern, dflt, newPrefix(prefix,key)));
transferUntil(pis, keyStream, appendBytes("#(/".getBytes(),key,")#".getBytes("UTF-8"),null));
@ -351,13 +274,13 @@ public final class TemplateEngine {
} else {
while(!found){
bb=pis.read(); // performance problem? trace always points to this line
if ((bb & 0xFF) == hash){
if ((bb & 0xFF) == ContentTransformer.hashChar){
bb=pis.read();
if ((bb & 0xFF) == lrbr){
transferUntil(pis, keyStream, aClose);
if ((bb & 0xFF) == ContentTransformer.lrbr){
transferUntil(pis, keyStream, ContentTransformer.aClose);
//reached the end. output last string.
if (java.util.Arrays.equals(keyStream.toByteArray(),appendBytes(slash, key, null,null))) {
if (java.util.Arrays.equals(keyStream.toByteArray(),appendBytes(ContentTransformer.slashChar, key, null,null))) {
pis2 = new PushbackInputStream(new ByteArrayInputStream(text.getBytes()));
//this maybe the wrong, but its the last
structure.append('<').append(key).append(" type=\"alternative\" which=\"".getBytes()).append(Integer.toString(whichPattern).getBytes()).append("\" found=\"0\">\n".getBytes());
@ -366,16 +289,16 @@ public final class TemplateEngine {
found=true;
}else if(others >0 && keyStream.toString().startsWith("/")){ //close nested
others--;
text.append(aOpen).append(keyStream.toByteArray()).append(")#".getBytes());
text.append(ContentTransformer.aOpen).append(keyStream.toByteArray()).append(")#".getBytes());
} else { //nested
others++;
text.append(aOpen).append(keyStream.toByteArray()).append(")#".getBytes());
text.append(ContentTransformer.aOpen).append(keyStream.toByteArray()).append(")#".getBytes());
}
keyStream.reset(); //reset stream
continue;
} //is not #(
pis.unread(bb);//is processed in next loop
bb = (hash);//will be added to text this loop
bb = (ContentTransformer.hashChar);//will be added to text this loop
//text += "#";
}else if ((bb & 0xFF) == ':' && others==0){//ignore :: in nested Expressions
bb=pis.read();
@ -407,8 +330,8 @@ public final class TemplateEngine {
}//if(byName) (else branch)
// #[
} else if ((bb & 0xFF) == lbr) { //normal
if (transferUntil(pis, keyStream, pClose)) {
} else if ((bb & 0xFF) == ContentTransformer.lbr) { //normal
if (transferUntil(pis, keyStream, ContentTransformer.pClose)) {
// pattern detected, write replacement
key = keyStream.toByteArray();
final String patternKey = getPatternKey(prefix, key);
@ -425,13 +348,13 @@ public final class TemplateEngine {
}
// #%
} else if ((bb & 0xFF) == ps) { //include
} else if ((bb & 0xFF) == ContentTransformer.pcChar) { //include
final ByteBuffer include = new ByteBuffer();
keyStream.reset(); //reset stream
if(transferUntil(pis, keyStream, iClose)){
if(transferUntil(pis, keyStream, ContentTransformer.iClose)){
byte[] filename = keyStream.toByteArray();
//if(filename.startsWith( Character.toString((char)lbr) ) && filename.endsWith( Character.toString((char)rbr) )){ //simple pattern for filename
if((filename[0] == lbr) && (filename[filename.length-1] == rbr)){ //simple pattern for filename
if((filename[0] == ContentTransformer.lbr) && (filename[filename.length-1] == ContentTransformer.rbr)){ //simple pattern for filename
final byte[] newFilename = new byte[filename.length-2];
System.arraycopy(filename, 1, newFilename, 0, newFilename.length);
final String patternkey = getPatternKey(prefix, newFilename);
@ -462,7 +385,7 @@ public final class TemplateEngine {
// # - no special character. This is simply a '#' without meaning
} else { //no match, but a single hash (output # + bb)
out.write(hash);
out.write(ContentTransformer.hashChar);
out.write(bb);
}
}

@ -5,12 +5,12 @@ package de.anomic.http.server.servlets;
import java.io.IOException;
import java.text.ParseException;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.content.RSSMessage;
import de.anomic.data.Blacklist;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;

@ -34,14 +34,14 @@ import java.util.Date;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import de.anomic.document.Condenser;
import de.anomic.document.Document;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
/**
* convenience class to access the yacycore library from outside of yacy to put files into the index

@ -29,12 +29,12 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.document.Document;
import de.anomic.document.parser.html.ImageEntry;
public class MediaSnippet {
public int type;

@ -38,6 +38,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReference;
@ -51,7 +52,6 @@ import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.util.ScoreCluster;
import de.anomic.data.Blacklist;
import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.http.client.Client;
import de.anomic.http.client.RemoteProxyConfig;
import de.anomic.http.server.ResponseContainer;

@ -30,6 +30,9 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeSet;
import net.yacy.document.Condenser;
import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.order.Base64Order;
@ -37,9 +40,6 @@ import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.SetTools;
import de.anomic.document.Condenser;
import de.anomic.document.parser.html.AbstractScraper;
import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.yacy.yacySeed;
public final class QueryParams {

@ -41,6 +41,7 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -54,7 +55,6 @@ import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.SortStack;
import de.anomic.document.Condenser;
import de.anomic.server.serverProfiling;
import de.anomic.ymage.ProfilingGraph;

@ -32,6 +32,7 @@ import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
@ -40,7 +41,6 @@ import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.ScoreCluster;
import de.anomic.document.Condenser;
public class ReferenceOrder {

@ -30,6 +30,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -37,7 +38,6 @@ import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.rwi.Reference;
import de.anomic.document.Condenser;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;

@ -31,13 +31,13 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.SetTools;
import net.yacy.kelondro.util.SortStack;
import net.yacy.kelondro.util.SortStore;
import de.anomic.document.Condenser;
import de.anomic.search.RankingProcess.NavigatorEntry;
import de.anomic.search.MediaSnippet;
import de.anomic.server.serverProfiling;

@ -34,6 +34,8 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.navigation.NavigationReference;
@ -53,8 +55,6 @@ import net.yacy.kelondro.util.ISO639;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.Blacklist;
import de.anomic.document.Condenser;
import de.anomic.document.Document;
public class Segment {

@ -32,14 +32,14 @@ import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.IndexCell;
import de.anomic.document.Condenser;
import de.anomic.document.Document;
public class Segments implements Iterable<Segment> {

@ -111,6 +111,14 @@ import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.ParserException;
import net.yacy.document.content.DCEntry;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.content.file.SurrogateReader;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -129,9 +137,6 @@ import net.yacy.kelondro.workflow.WorkflowJob;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.kelondro.workflow.WorkflowThread;
import de.anomic.content.DCEntry;
import de.anomic.content.RSSMessage;
import de.anomic.content.file.SurrogateReader;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.CrawlStacker;
@ -161,11 +166,6 @@ import de.anomic.data.userDB;
import de.anomic.data.wiki.wikiBoard;
import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
import de.anomic.document.Condenser;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.client.Client;
import de.anomic.http.client.RemoteProxyConfig;
import de.anomic.http.client.Cache;
@ -1696,12 +1696,6 @@ public final class Switchboard extends serverSwitch {
}
}
/**
 * Runs the web-structure analysis for one indexing queue entry.
 *
 * @param in the entry to process; its queueEntry, document and condenser members are read
 * @return the same entry object that was passed in
 */
public indexingQueueEntry webStructureAnalysis(final indexingQueueEntry in) {
// record that this entry has reached the structure-analysis state
in.queueEntry.updateStatus(Response.QUEUE_STATE_STRUCTUREANALYSIS);
// hand the web structure graph, the condenser and the entry's last-modified value to the document
in.document.notifyWebStructure(webStructure, in.condenser, in.queueEntry.lastModified());
return in;
}
public void storeDocumentIndex(final indexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_INDEXSTORAGE);
storeDocumentIndex(in.process, in.queueEntry, in.document, in.condenser);
@ -1839,7 +1833,7 @@ public final class Switchboard extends serverSwitch {
final Long resourceContentLength = (Long) resource[1];
// parse the resource
final Document document = Document.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent);
final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent);
// get the word set
Set<String> words = null;

@ -35,6 +35,10 @@ import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
@ -44,11 +48,8 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.SetTools;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
import de.anomic.document.Condenser;
import de.anomic.document.Document;
import de.anomic.document.ParserException;
import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.http.client.Cache;
import de.anomic.http.server.ResponseHeader;
import de.anomic.yacy.yacySearch;
@ -387,7 +388,7 @@ public class TextSnippet {
* =========================================================================== */
Document document = null;
try {
document = Document.parseDocument(url, resContentLength, resContent, responseHeader);
document = LoaderDispatcher.parseDocument(url, resContentLength, resContent, responseHeader);
} catch (final ParserException e) {
return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
} finally {

@ -52,10 +52,10 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.Formatter;
import de.anomic.document.parser.html.CharacterCoding;
import de.anomic.search.Switchboard;
public class serverObjects extends HashMap<String, String> implements Cloneable {

@ -46,9 +46,9 @@ import java.security.spec.InvalidKeySpecException;
import java.security.spec.PKCS8EncodedKeySpec;
import java.security.spec.X509EncodedKeySpec;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.order.Base64Order;
import de.anomic.server.serverCharBuffer;
/**
* Tool functions to sign and verify files and generate keys
@ -141,7 +141,7 @@ public class CryptoLib {
} else if(args[0].equals("--sign") && args.length==3) {
CryptoLib cl = new CryptoLib();
serverCharBuffer privKeyBuffer = new serverCharBuffer(new File(args[1]));
CharBuffer privKeyBuffer = new CharBuffer(new File(args[1]));
byte[] privKeyByteBuffer = Base64Order.standardCoder.decode(privKeyBuffer.toString());
PrivateKey privKey = cl.getPrivateKeyFromBytes(privKeyByteBuffer);
@ -153,13 +153,13 @@ public class CryptoLib {
signFile.close();
} else if(args[0].equals("--verify") && args.length==3) {
CryptoLib cl = new CryptoLib();
serverCharBuffer pubKeyBuffer = new serverCharBuffer(new File(args[1]));
CharBuffer pubKeyBuffer = new CharBuffer(new File(args[1]));
byte[] pubKeyByteBuffer = Base64Order.standardCoder.decode(pubKeyBuffer.toString().trim());
PublicKey pubKey = cl.getPublicKeyFromBytes(pubKeyByteBuffer);
FileInputStream dataStream = new FileInputStream(args[2]);
serverCharBuffer signBuffer = new serverCharBuffer(new File(args[2] + ".sig"));
CharBuffer signBuffer = new CharBuffer(new File(args[2] + ".sig"));
byte[] signByteBuffer = Base64Order.standardCoder.decode(signBuffer.toString().trim());
if(cl.verifySignature(pubKey, dataStream, signByteBuffer)) {
System.out.println("Signature OK!");

@ -26,6 +26,9 @@
package de.anomic.tools;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
@ -62,9 +65,6 @@ import java.util.concurrent.TimeoutException;
import de.anomic.data.wiki.wikiCode;
import de.anomic.data.wiki.wikiParser;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
/*
* this class provides data structures to read a mediawiki dump file in xml format

@ -55,6 +55,8 @@ import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.document.parser.xml.RSSReader;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
@ -73,8 +75,6 @@ import org.apache.commons.httpclient.methods.multipart.Part;
import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.data.Blacklist;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.document.parser.xml.RSSReader;
import de.anomic.http.client.DefaultCharsetFilePart;
import de.anomic.http.client.DefaultCharsetStringPart;
import de.anomic.http.client.Client;

@ -48,12 +48,12 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.content.RSSMessage;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.search.Switchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverSemaphore;

@ -27,12 +27,12 @@ package de.anomic.yacy;
import java.io.IOException;
import java.util.HashMap;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.MapTools;
import de.anomic.content.RSSMessage;
import de.anomic.document.parser.xml.RSSFeed;
public class yacyPeerActions {

@ -44,20 +44,21 @@ import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseContainer;
import de.anomic.search.Switchboard;
import de.anomic.server.serverCharBuffer;
import de.anomic.server.serverCore;
import de.anomic.server.serverSystem;
import de.anomic.tools.CryptoLib;
@ -234,7 +235,7 @@ public final class yacyRelease extends yacyVersion {
// returns the version info if successful, null otherwise
ContentScraper scraper;
try {
scraper = ContentScraper.parseResource(Switchboard.getSwitchboard().loader, location.getLocationURL(), CrawlProfile.CACHE_STRATEGY_NOCACHE);
scraper = LoaderDispatcher.parseResource(Switchboard.getSwitchboard().loader, location.getLocationURL(), CrawlProfile.CACHE_STRATEGY_NOCACHE);
} catch (final IOException e) {
return null;
}
@ -359,8 +360,8 @@ public final class yacyRelease extends yacyVersion {
public boolean checkSignature() {
if(releaseFile != null) {
try {
serverCharBuffer signBuffer;
signBuffer = new serverCharBuffer(getSignatureFile());
CharBuffer signBuffer;
signBuffer = new CharBuffer(getSignatureFile());
byte[] signByteBuffer = Base64Order.standardCoder.decode(signBuffer.toString().trim());
CryptoLib cl = new CryptoLib();
for(yacyUpdateLocation updateLocation : latestReleaseLocations) {

@ -37,6 +37,8 @@ import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
@ -44,8 +46,6 @@ import net.yacy.kelondro.order.MicroDate;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.document.Condenser;
import de.anomic.document.Document;
public class WebStructureGraph {
@ -58,7 +58,7 @@ public class WebStructureGraph {
private final Log log;
private final File rankingPath, structureFile;
private final String crlFile, crgFile;
TreeMap<String, String> structure_old, structure_new; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
private TreeMap<String, String> structure_old, structure_new; // <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*
public WebStructureGraph(final Log log, final File rankingPath, final String crlFile, final String crgFile, final File structureFile) {
this.log = log;

@ -43,6 +43,13 @@ public class ymageGraph {
// a ymageGraph is a set of points and borders between the points
// to reference the points, they must all have a nickname
public static final long color_back = 0xFFFFFF;
public static final long color_text = 0x888888;
private static final long color_dot = 0x11BB11;
private static final long color_line = 0x222222;
private static final long color_lineend = 0x333333;
HashMap<String, coordinate> points;
HashSet<String> borders;
double leftmost, rightmost, topmost, bottommost;
@ -124,12 +131,6 @@ public class ymageGraph {
}
}
public static final long color_back = 0xFFFFFF;
public static final long color_text = 0xAAAAAA;
private static final long color_dot = 0x11CC11;
private static final long color_line = 0x333333;
private static final long color_lineend = 0x666666;
public ymageMatrix draw(final int width, final int height, final int leftborder, final int rightborder, final int topborder, final int bottomborder) {
final ymageMatrix image = new ymageMatrix(width, height, ymageMatrix.MODE_SUB, color_back);
final double xfactor = ((rightmost - leftmost) == 0.0) ? 0.0 : (width - leftborder - rightborder) / (rightmost - leftmost);

@ -23,7 +23,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
package net.yacy.document;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
@ -39,7 +39,7 @@ import net.yacy.kelondro.workflow.WorkflowThread;
/**
* New classes implementing the {@link de.anomic.document.Idiom} interface
* New classes implementing the {@link net.yacy.document.Idiom} interface
* can extend this class to inherit all functions already implemented in this class.
* @author Martin Thelian
* @version $LastChangedRevision$ / $LastChangedDate$
@ -148,7 +148,7 @@ public abstract class AbstractParser implements Idiom {
* and some additional metadata.
* @throws ParserException if the content could not be parsed properly
*
* @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, byte[])
* @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, byte[])
*/
public Document parse(
final DigestURI location,
@ -183,7 +183,7 @@ public abstract class AbstractParser implements Idiom {
* and some additional metadata.
* @throws ParserException if the content could not be parsed properly
*
* @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.File)
* @see net.yacy.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.File)
*/
public Document parse(
final DigestURI location,
@ -218,7 +218,7 @@ public abstract class AbstractParser implements Idiom {
* and some additional metadata.
* @throws ParserException if the content could not be parsed properly
*
* @see de.anomic.document.Idiom#parse(de.anomic.net.URL, java.lang.String, java.io.InputStream)
* @see net.yacy.document.Idiom#parse(net.yacy.kelondro.data.meta.DigestURI, java.lang.String, java.lang.String, java.io.InputStream)
*/
public abstract Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
package net.yacy.document;
import java.io.BufferedInputStream;
import java.io.File;

@ -23,7 +23,7 @@
// compile with javac -sourcepath source source/de/anomic/plasma/plasmaCondenser.java
// execute with java -cp source de.anomic.plasma.plasmaCondenser
package de.anomic.document;
package net.yacy.document;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
@ -46,6 +46,9 @@ import java.util.Properties;
import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceRow;
@ -53,9 +56,6 @@ import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.util.SetTools;
import de.anomic.document.language.Identificator;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.ImageEntry;
public final class Condenser {

@ -21,7 +21,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
package net.yacy.document;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
@ -43,17 +43,13 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CachedFileOutputStream;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.ImageEntry;
import de.anomic.http.client.Cache;
import de.anomic.http.client.Client;
import de.anomic.http.server.ResponseHeader;
import de.anomic.server.serverCachedFileOutputStream;
import de.anomic.ymage.WebStructureGraph;
public class Document {
@ -104,7 +100,7 @@ public class Document {
this.languages = languages;
if (text == null) try {
this.text = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
} catch (final IOException e) {
e.printStackTrace();
this.text = new StringBuilder();
@ -134,7 +130,7 @@ public class Document {
public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final serverCachedFileOutputStream text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images) {
final CachedFileOutputStream text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images) {
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
@ -251,8 +247,8 @@ dc_rights
this.textStream = new BufferedInputStream(new FileInputStream((File)this.text));
} else if (this.text instanceof byte[]) {
this.textStream = new ByteArrayInputStream((byte[])this.text);
} else if (this.text instanceof serverCachedFileOutputStream) {
return ((serverCachedFileOutputStream)this.text).getContent();
} else if (this.text instanceof CachedFileOutputStream) {
return ((CachedFileOutputStream)this.text).getContent();
}
return this.textStream;
} catch (final Exception e) {
@ -269,8 +265,8 @@ dc_rights
return FileUtils.read((File)this.text);
} else if (this.text instanceof byte[]) {
return (byte[])this.text;
} else if (this.text instanceof serverCachedFileOutputStream) {
final serverCachedFileOutputStream ffbaos = (serverCachedFileOutputStream)this.text;
} else if (this.text instanceof CachedFileOutputStream) {
final CachedFileOutputStream ffbaos = (CachedFileOutputStream)this.text;
if (ffbaos.isFallback()) {
return FileUtils.read(ffbaos.getContent());
}
@ -286,8 +282,8 @@ dc_rights
if (this.text == null) return 0;
if (this.text instanceof File) return ((File)this.text).length();
else if (this.text instanceof byte[]) return ((byte[])this.text).length;
else if (this.text instanceof serverCachedFileOutputStream) {
return ((serverCachedFileOutputStream)this.text).getLength();
else if (this.text instanceof CachedFileOutputStream) {
return ((CachedFileOutputStream)this.text).getLength();
}
return -1;
@ -525,11 +521,11 @@ dc_rights
if (this.description.length() > 0) this.description.append('\n');
this.description.append(doc.dc_description());
if (!(this.text instanceof serverCachedFileOutputStream)) {
this.text = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
FileUtils.copy(getText(), (serverCachedFileOutputStream)this.text);
if (!(this.text instanceof CachedFileOutputStream)) {
this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
FileUtils.copy(getText(), (CachedFileOutputStream)this.text);
}
FileUtils.copy(doc.getText(), (serverCachedFileOutputStream)this.text);
FileUtils.copy(doc.getText(), (CachedFileOutputStream)this.text);
anchors.putAll(doc.getAnchors());
ContentScraper.addAllImages(images, doc.getImages());
@ -549,12 +545,6 @@ dc_rights
this.favicon = faviconURL;
}
/**
 * Registers this document with the given web structure graph and stores the
 * link counts the graph reports back.
 *
 * @param webStructure the graph that generates the citation reference
 * @param condenser    passed through to the graph
 * @param docDate      passed through to the graph
 */
public void notifyWebStructure(final WebStructureGraph webStructure, final Condenser condenser, final Date docDate) {
    // the graph answers with a two-element array: [outlinksSame, outlinksOther]
    final Integer[] linkCounts = webStructure.generateCitationReference(this, condenser, docDate);
    this.inboundLinks = linkCounts[0].intValue();   // outlinksSame
    this.outboundLinks = linkCounts[1].intValue();  // outlinksOther
}
/**
 * @return the stored inbound link count, or 0 while the stored value is negative
 */
public int inboundLinks() {
    if (this.inboundLinks < 0) {
        return 0;
    }
    return this.inboundLinks;
}
@ -608,61 +598,5 @@ dc_rights
this.close();
super.finalize();
}
/**
 * Parse the resource.
 * <p>
 * If no response header is passed in, one is looked up first from the cache and,
 * for URLs whose protocol starts with "http", by a HEAD request. When no header
 * can be obtained, the parser is called without mime type and charset.
 *
 * @param url the URL of the resource
 * @param contentLength the contentLength of the resource
 * @param resourceStream the resource body as stream
 * @param responseHeader metadata about the resource; may be <code>null</code>
 * @return the extracted data, or <code>null</code> if <code>resourceStream</code> is
 *         <code>null</code>, if {@link Parser#supports} reports an error for the URL and
 *         mime type, or if the thread was interrupted while parsing
 * @throws ParserException
 */
public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream, ResponseHeader responseHeader) throws ParserException {
    try {
        if (resourceStream == null) return null;

        // STEP 1 + 2: complete missing resource metadata from cache or web
        final ResponseHeader header = lookupResponseHeader(url, responseHeader);

        // STEP 3: if the metadata is still null the parser has to guess the mimeType of the resource
        final String mimeType = (header == null) ? null : header.mime();
        final String supportError = Parser.supports(url, mimeType);
        if (supportError != null) {
            return null;
        }
        final String charset = (header == null) ? null : header.getCharacterEncoding();
        return Parser.parseSource(url, mimeType, charset, contentLength, resourceStream);
    } catch (final InterruptedException e) {
        // interruption of thread detected: keep the interrupt status set so that
        // callers further up the stack can still observe it
        Thread.currentThread().interrupt();
        return null;
    }
}

/**
 * Returns the given response header or, if it is <code>null</code>, tries to obtain one
 * from the htcache and then (for http-like protocols) from the web. Lookup failures
 * are ignored on purpose; the result may still be <code>null</code>.
 */
private static ResponseHeader lookupResponseHeader(final DigestURI url, final ResponseHeader known) {
    ResponseHeader header = known;

    // STEP 1: if no resource metadata is available, try to load it from cache
    if (header == null) {
        try {
            header = Cache.getResponseHeader(url);
        } catch (final Exception ignored) {
            // ignore this. resource info loading failed
        }
    }

    // STEP 2: if the metadata is still null try to download it from web
    if ((header == null) && (url.getProtocol().startsWith("http"))) {
        // TODO: we need a better solution here
        // e.g. encapsulate this in the crawlLoader class
        try {
            header = Client.whead(url.toString());
        } catch (final Exception ignored) {
            // ignore this. http header download failed
        }
    }
    return header;
}
/**
 * Parse the resource when no response header is at hand.
 *
 * @param url the URL of the resource
 * @param contentLength the contentLength of the resource
 * @param resourceStream the resource body as stream
 * @return whatever the four-argument variant returns for a <code>null</code> response header
 * @throws ParserException
 */
public static Document parseDocument(final DigestURI url, final long contentLength, final InputStream resourceStream) throws ParserException {
    // delegate with an explicitly absent header
    final ResponseHeader unknownHeader = null;
    return parseDocument(url, contentLength, resourceStream, unknownHeader);
}
}

@ -23,7 +23,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
package net.yacy.document;
import java.io.File;
import java.io.InputStream;

@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
package net.yacy.document;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
@ -39,27 +39,27 @@ import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.document.parser.bzipParser;
import net.yacy.document.parser.docParser;
import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.odtParser;
import net.yacy.document.parser.ooxmlParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.pptParser;
import net.yacy.document.parser.psParser;
import net.yacy.document.parser.rssParser;
import net.yacy.document.parser.rtfParser;
import net.yacy.document.parser.sevenzipParser;
import net.yacy.document.parser.swfParser;
import net.yacy.document.parser.tarParser;
import net.yacy.document.parser.vcfParser;
import net.yacy.document.parser.vsdParser;
import net.yacy.document.parser.xlsParser;
import net.yacy.document.parser.zipParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import de.anomic.document.parser.bzipParser;
import de.anomic.document.parser.docParser;
import de.anomic.document.parser.gzipParser;
import de.anomic.document.parser.htmlParser;
import de.anomic.document.parser.odtParser;
import de.anomic.document.parser.ooxmlParser;
import de.anomic.document.parser.pdfParser;
import de.anomic.document.parser.pptParser;
import de.anomic.document.parser.psParser;
import de.anomic.document.parser.rssParser;
import de.anomic.document.parser.rtfParser;
import de.anomic.document.parser.sevenzipParser;
import de.anomic.document.parser.swfParser;
import de.anomic.document.parser.tarParser;
import de.anomic.document.parser.vcfParser;
import de.anomic.document.parser.vsdParser;
import de.anomic.document.parser.xlsParser;
import de.anomic.document.parser.zipParser;
public final class Parser {

@ -22,7 +22,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
package net.yacy.document;
import net.yacy.kelondro.data.meta.DigestURI;

@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document;
package net.yacy.document;
import java.util.HashSet;

@ -23,7 +23,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.content;
package net.yacy.document.content;
import java.io.IOException;
import java.io.OutputStreamWriter;
@ -36,10 +36,10 @@ import java.util.HashSet;
import java.util.Locale;
import java.util.TreeMap;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.document.Document;
public class DCEntry extends TreeMap<String, String> {

@ -25,7 +25,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.content;
package net.yacy.document.content;
import java.util.Date;
import java.util.HashMap;

@ -22,14 +22,15 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.content.dao;
package net.yacy.document.content.dao;
import java.io.File;
import java.sql.SQLException;
import java.util.Date;
import java.util.concurrent.BlockingQueue;
import de.anomic.content.DCEntry;
import net.yacy.document.content.DCEntry;
/*
* Database Access Objects are used to get a normalized view on database objects with java objects

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.content.dao;
package net.yacy.document.content.dao;
import java.sql.Connection;
import java.sql.DriverManager;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.content.dao;
package net.yacy.document.content.dao;
import java.io.ByteArrayOutputStream;
import java.io.File;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.content.dao;
package net.yacy.document.content.dao;
import java.io.BufferedOutputStream;
import java.io.File;
@ -39,9 +39,9 @@ import java.util.HashMap;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import net.yacy.document.content.DCEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.content.DCEntry;
public class PhpBB3Dao implements Dao {

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.content.file;
package net.yacy.document.content.file;
import java.io.BufferedInputStream;
import java.io.File;
@ -36,11 +36,12 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.document.content.DCEntry;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import de.anomic.content.DCEntry;
public class SurrogateReader extends DefaultHandler implements Runnable {

@ -22,7 +22,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.detector;
package net.yacy.document.detector;
import java.io.File;
import java.io.IOException;

@ -22,7 +22,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.detector;
package net.yacy.document.detector;
import java.io.ByteArrayInputStream;
import java.io.File;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.language;
package net.yacy.document.language;
import java.util.HashMap;
import java.util.Iterator;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.language;
package net.yacy.document.language;
import java.io.File;
import java.io.FilenameFilter;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.language;
package net.yacy.document.language;
import java.io.BufferedReader;
import java.io.File;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.language;
package net.yacy.document.language;
import java.io.File;
import java.io.FilenameFilter;

@ -25,7 +25,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.File;
import java.io.FileOutputStream;
@ -33,16 +33,16 @@ import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.Parser;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import org.apache.tools.bzip2.CBZip2InputStream;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
public class bzipParser extends AbstractParser implements Idiom {

@ -25,17 +25,17 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Set;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.poi.hwpf.extractor.WordExtractor;

@ -25,7 +25,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.File;
import java.io.FileOutputStream;
@ -34,14 +34,14 @@ import java.util.HashSet;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.Parser;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
public class gzipParser extends AbstractParser implements Idiom {

@ -27,7 +27,7 @@
// MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE.
// CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH.
package de.anomic.document.parser.html;
package net.yacy.document.parser.html;
import java.util.HashSet;
import java.util.Properties;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser.html;
package net.yacy.document.parser.html;
import java.util.Properties;
import java.util.TreeSet;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser.html;
package net.yacy.document.parser.html;
import java.util.HashMap;

@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser.html;
package net.yacy.document.parser.html;
import java.io.ByteArrayInputStream;
import java.io.File;
@ -43,14 +43,12 @@ import java.util.Properties;
import javax.swing.event.EventListenerList;
import net.yacy.document.parser.htmlParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.ISO639;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
import de.anomic.document.parser.htmlParser;
import de.anomic.server.serverCharBuffer;
public class ContentScraper extends AbstractScraper implements Scraper {
@ -84,7 +82,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private String title;
//private String headline;
private List<String>[] headlines;
private serverCharBuffer content;
private CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
/**
@ -109,7 +107,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.title = "";
this.headlines = new ArrayList[4];
for (int i = 0; i < 4; i++) headlines[i] = new ArrayList<String>();
this.content = new serverCharBuffer(1024);
this.content = new CharBuffer(1024);
this.htmlFilterEventListeners = new EventListenerList();
}
@ -504,20 +502,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return scraper;
}
/**
 * Loads the resource at the given location through the loader and feeds its
 * content, decoded as UTF-8, into a freshly created ContentScraper.
 *
 * @param loader      dispatcher used to fetch the resource
 * @param location    URL of the resource to load and scrape
 * @param cachePolicy cache policy value handed through to the loader
 * @return the scraper that received the page content
 * @throws IOException if the loader yields no response or no content for the location
 */
public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException {
    // fetch the resource; a missing response is treated like missing content
    final Response response = loader.load(location, true, false, cachePolicy);
    final byte[] content;
    if (response == null) {
        content = null;
    } else {
        content = response.getContent();
    }
    if (content == null) {
        throw new IOException("no response from url " + location.toString());
    }

    // push the decoded page through a transformer writer that only feeds the scraper
    final ContentScraper scraper = new ContentScraper(location);
    final Writer scrapeWriter = new TransformerWriter(null, null, scraper, null, false);
    final String decodedPage = new String(content, "UTF-8");
    scrapeWriter.write(decodedPage);
    return scraper;
}
public static void addAllImages(final HashMap<String, ImageEntry> a, final HashMap<String, ImageEntry> b) {
final Iterator<Map.Entry<String, ImageEntry>> i = b.entrySet().iterator();
Map.Entry<String, ImageEntry> ie;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser.html;
package net.yacy.document.parser.html;
import java.io.BufferedReader;
import java.io.File;
@ -30,18 +30,48 @@ import java.io.FileReader;
import java.io.IOException;
import java.text.Collator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.TreeSet;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.ByteBuffer;
import de.anomic.http.server.TemplateEngine;
import de.anomic.server.serverCharBuffer;
public class ContentTransformer extends AbstractTransformer implements Transformer {
// statics: for initialisation of the HTMLFilterAbstractTransformer
// Delimiter bytes and byte sequences of the template markup that splitQuotation() looks for.
// Every open/close pair below combines hashChar ('#') with one bracket-like character.
public final static byte hashChar = (byte)'#';
public final static byte[] slashChar = {(byte)'/'};
public final static byte pcChar = (byte)'%';
// the two-character separator "::"
// NOTE(review): String.getBytes() without a charset uses the platform default encoding,
// unlike the (byte) casts used for the other constants - confirm this is intended
public final static byte[] dpdpa = "::".getBytes();
public final static byte lbr = (byte)'[';
public final static byte rbr = (byte)']';
// "#[" ... "]#"
public final static byte[] pOpen = {hashChar, lbr};
public final static byte[] pClose = {rbr, hashChar};
public final static byte lcbr = (byte)'{';
public final static byte rcbr = (byte)'}';
// "#{" ... "}#"
public final static byte[] mOpen = {hashChar, lcbr};
public final static byte[] mClose = {rcbr, hashChar};
public final static byte lrbr = (byte)'(';
public final static byte rrbr = (byte)')';
// "#(" ... ")#"
public final static byte[] aOpen = {hashChar, lrbr};
public final static byte[] aClose = {rrbr, hashChar};
// "#%" ... "%#"
public final static byte[] iOpen = {hashChar, pcChar};
public final static byte[] iClose = {pcChar, hashChar};
// Ordered table of {open, close} delimiter pairs; splitQuotation() selects one pair per
// recursion level through its qoff argument, each entry being an Object[] of two byte[].
private final static Object[] meta_quotation = new Object[] {
new Object[] {pOpen, pClose},
new Object[] {mOpen, mClose},
new Object[] {aOpen, aClose},
new Object[] {iOpen, iClose}
};
// statics: for initialization of the HTMLFilterAbstractTransformer
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
private static final TreeSet<String> linkTags0 = new TreeSet<String>(insensitiveCollator);;
private static final TreeSet<String> linkTags1 = new TreeSet<String>(insensitiveCollator);;
@ -89,7 +119,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform
}
private static char[] genBlueLetters(int length) {
final serverCharBuffer bb = new serverCharBuffer(" <FONT COLOR=#0000FF>".toCharArray());
final CharBuffer bb = new CharBuffer(" <FONT COLOR=#0000FF>".toCharArray());
length = length / 2;
if (length > 10) length = 7;
while (length-- > 0) {
@ -118,13 +148,13 @@ public class ContentTransformer extends AbstractTransformer implements Transform
final ArrayList<String> result = new ArrayList<String>();
final ByteBuffer sbb = new ByteBuffer(text);
final ByteBuffer[] sbbs = TemplateEngine.splitQuotations(sbb);
final ByteBuffer[] sbbs = splitQuotations(sbb);
for (int i = 0; i < sbbs.length; i++) {
// TODO: avoid empty if statements
if (sbbs[i].isWhitespace(true)) {
//sbb.append(sbbs[i]);
} else if ((sbbs[i].byteAt(0) == TemplateEngine.hash) ||
(sbbs[i].startsWith(TemplateEngine.dpdpa))) {
} else if ((sbbs[i].byteAt(0) == hashChar) ||
(sbbs[i].startsWith(dpdpa))) {
// this is a template or a part of a template
//sbb.append(sbbs[i]);
} else {
@ -136,6 +166,52 @@ public class ContentTransformer extends AbstractTransformer implements Transform
}
return result;
}
/**
 * Splits the given text into its quotation segments and returns them as an array.
 * The work is delegated to splitQuotation(text, 0); the resulting list is copied,
 * in order, into a ByteBuffer array of the same size.
 *
 * @param text the text to split
 * @return the segments in the order produced by splitQuotation
 */
public final static ByteBuffer[] splitQuotations(final ByteBuffer text) {
    final List<ByteBuffer> segments = splitQuotation(text, 0);
    return segments.toArray(new ByteBuffer[segments.size()]);
}
/**
 * Recursively splits text into a list of segments, using the delimiter pair found at
 * index qoff of meta_quotation on this level and the following pairs on deeper levels.
 *
 * Each span that starts with the left delimiter and ends with the matching right delimiter
 * is added as one segment; the text in front of it is split with the next delimiter pair.
 * What remains afterwards is split at an occurrence of dpdpa ("::"), which is added as a
 * segment of its own. Once qoff is past the last entry of meta_quotation, non-empty text is
 * added unchanged.
 *
 * NOTE(review): text.clear() below is called on the parameter, which may still be the
 * buffer passed in by the caller if it was not reassigned - confirm callers do not reuse it.
 *
 * @param text the text to split
 * @param qoff index into meta_quotation of the delimiter pair to use on this level
 * @return the list of segments in the order they were found
 */
private final static List<ByteBuffer> splitQuotation(ByteBuffer text, int qoff) {
final ArrayList<ByteBuffer> l = new ArrayList<ByteBuffer>();
// recursion end: all delimiter pairs have been tried, keep the text as one segment
if (qoff >= meta_quotation.length) {
if (text.length() > 0) l.add(text);
return l;
}
int p = -1, q;
final byte[] left = (byte[]) ((Object[]) meta_quotation[qoff])[0];
final byte[] right = (byte[]) ((Object[]) meta_quotation[qoff])[1];
// all recursive calls below continue with the next delimiter pair
qoff++;
while ((text.length() > 0) && ((p = text.indexOf(left)) >= 0)) {
// search for the closing delimiter behind the position of the opening one
q = text.indexOf(right, p + 1);
if (q >= 0) {
// found a pattern
l.addAll(splitQuotation(new ByteBuffer(text.getBytes(0, p)), qoff));
l.add(new ByteBuffer(text.getBytes(p, q + right.length - p)));
// continue with the text behind the closing delimiter
text = new ByteBuffer(text.getBytes(q + right.length));
} else {
// found only pattern start, no closing parenthesis (a syntax error that is silently accepted here)
l.addAll(splitQuotation(new ByteBuffer(text.getBytes(0, p)), qoff));
l.addAll(splitQuotation(new ByteBuffer(text.getBytes(p)), qoff));
text.clear();
}
}
// find double-points
while ((text.length() > 0) && ((p = text.indexOf(dpdpa)) >= 0)) {
l.addAll(splitQuotation(new ByteBuffer(text.getBytes(0, p)), qoff));
l.add(new ByteBuffer(dpdpa));
// p + 2 skips the two bytes of the "::" separator
l.addAll(splitQuotation(new ByteBuffer(text.getBytes(p + 2)), qoff));
text.clear();
}
// add remaining
if (text.length() > 0) l.addAll(splitQuotation(text, qoff));
return l;
}
public char[] transformText(final char[] text) {
if (bluelist != null) {
if (bluelistHit(text)) {

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser.html;
package net.yacy.document.parser.html;
import net.yacy.kelondro.data.meta.DigestURI;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser.html;
package net.yacy.document.parser.html;
import java.util.Properties;

@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser.html;
package net.yacy.document.parser.html;
import java.io.BufferedInputStream;
import java.io.IOException;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser.html;
package net.yacy.document.parser.html;
import java.util.Properties;

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser.html;
package net.yacy.document.parser.html;
import java.util.Properties;

@ -29,7 +29,7 @@
*/
package de.anomic.document.parser.html;
package net.yacy.document.parser.html;
import java.io.File;
import java.io.FileOutputStream;
@ -45,8 +45,8 @@ import java.util.Enumeration;
import java.util.Properties;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import de.anomic.server.serverCharBuffer;
public final class TransformerWriter extends Writer {
@ -59,10 +59,10 @@ public final class TransformerWriter extends Writer {
private final OutputStream outStream;
private OutputStreamWriter out;
private serverCharBuffer buffer;
private CharBuffer buffer;
private String filterTag;
private Properties filterOpts;
private serverCharBuffer filterCont;
private CharBuffer filterCont;
private final Scraper scraper;
private final Transformer transformer;
private boolean inSingleQuote;
@ -83,7 +83,7 @@ public final class TransformerWriter extends Writer {
this.outStream = outStream;
this.scraper = scraper;
this.transformer = transformer;
this.buffer = new serverCharBuffer(1024);
this.buffer = new CharBuffer(1024);
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
@ -101,7 +101,7 @@ public final class TransformerWriter extends Writer {
}
public static char[] genTag0raw(final String tagname, final boolean opening, final char[] tagopts) {
final serverCharBuffer bb = new serverCharBuffer(tagname.length() + tagopts.length + 3);
final CharBuffer bb = new CharBuffer(tagname.length() + tagopts.length + 3);
bb.append((int)'<');
if (!opening) {
bb.append((int)'/');
@ -123,7 +123,7 @@ public final class TransformerWriter extends Writer {
}
public static char[] genTag1raw(final String tagname, final char[] tagopts, final char[] text) {
final serverCharBuffer bb = new serverCharBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
final CharBuffer bb = new CharBuffer(2 * tagname.length() + tagopts.length + text.length + 5);
bb.append((int)'<').append(tagname);
if (tagopts.length > 0) {
// if (tagopts[0] == (byte) 32)
@ -144,7 +144,7 @@ public final class TransformerWriter extends Writer {
public static char[] genTag0(final String tagname, final Properties tagopts, final char quotechar) {
final char[] tagoptsx = (tagopts.size() == 0) ? null : genOpts(tagopts, quotechar);
final serverCharBuffer bb = new serverCharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
final CharBuffer bb = new CharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
bb.append((int)'<').append(tagname);
if (tagoptsx != null) {
bb.append(32);
@ -162,7 +162,7 @@ public final class TransformerWriter extends Writer {
public static char[] genTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
final char[] gt0 = genTag0(tagname, tagopts, quotechar);
final serverCharBuffer cb = new serverCharBuffer(gt0, gt0.length + text.length + tagname.length() + 3);
final CharBuffer cb = new CharBuffer(gt0, gt0.length + text.length + tagname.length() + 3);
cb.append(text).append((int)'<').append((int)'/').append(tagname).append((int)'>');
final char[] result = cb.getChars();
try {
@ -176,7 +176,7 @@ public final class TransformerWriter extends Writer {
// a helper method for pretty-printing of properties for html tags
public static char[] genOpts(final Properties prop, final char quotechar) {
final Enumeration<?> e = prop.propertyNames();
final serverCharBuffer bb = new serverCharBuffer(prop.size() * 40);
final CharBuffer bb = new CharBuffer(prop.size() * 40);
String key;
while (e.hasMoreElements()) {
key = (String) e.nextElement();
@ -212,7 +212,7 @@ public final class TransformerWriter extends Writer {
if (opening) {
if ((scraper != null) && (scraper.isTag0(tag))) {
// this single tag is collected at once here
final serverCharBuffer charBuffer = new serverCharBuffer(content);
final CharBuffer charBuffer = new CharBuffer(content);
scraper.scrapeTag0(tag, charBuffer.propParser());
try {
charBuffer.close();
@ -223,7 +223,7 @@ public final class TransformerWriter extends Writer {
}
if ((transformer != null) && (transformer.isTag0(tag))) {
// this single tag is collected at once here
final serverCharBuffer scb = new serverCharBuffer(content);
final CharBuffer scb = new CharBuffer(content);
try {
return transformer.transformTag0(tag, scb.propParser(), quotechar);
} finally {
@ -237,14 +237,14 @@ public final class TransformerWriter extends Writer {
((transformer != null) && (transformer.isTag1(tag)))) {
// ok, start collecting
filterTag = tag;
final serverCharBuffer scb = new serverCharBuffer(content);
final CharBuffer scb = new CharBuffer(content);
filterOpts = scb.propParser();
try {
scb.close();
} catch (IOException e) {
e.printStackTrace();
}
filterCont = new serverCharBuffer();
filterCont = new CharBuffer();
return new char[0];
} else {
// we ignore that thing and return it again

@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
@ -34,16 +34,16 @@ import java.nio.charset.UnsupportedCharsetException;
import java.util.HashSet;
import java.util.Set;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.document.AbstractParser;
import de.anomic.document.Document;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.ScraperInputStream;
import de.anomic.document.parser.html.TransformerWriter;
public class htmlParser extends AbstractParser implements Idiom {

@ -25,9 +25,8 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
@ -42,21 +41,17 @@ import java.util.zip.ZipFile;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.document.parser.xml.ODContentHandler;
import net.yacy.document.parser.xml.ODMetaHandler;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.document.parser.xml.ODContentHandler;
import de.anomic.document.parser.xml.ODMetaHandler;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.server.serverCharBuffer;
public class odtParser extends AbstractParser implements Idiom {
@ -148,7 +143,7 @@ public class odtParser extends AbstractParser implements Idiom {
writerFile = File.createTempFile("odtParser",".prt");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
writer = new serverCharBuffer();
writer = new CharBuffer();
}
// extract data
@ -192,8 +187,8 @@ public class odtParser extends AbstractParser implements Idiom {
// create the parser document
Document theDoc = null;
if (writer instanceof serverCharBuffer) {
final byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8");
if (writer instanceof CharBuffer) {
final byte[] contentBytes = ((CharBuffer)writer).toString().getBytes("UTF-8");
theDoc = new Document(
location,
mimeType,
@ -264,27 +259,4 @@ public class odtParser extends AbstractParser implements Idiom {
// Nothing to do here at the moment
super.reset();
}
/**
 * Manual test driver: downloads the document at the URL given as the single
 * command-line argument and runs it through an odtParser. Does nothing if the
 * number of arguments is not exactly one; any exception is printed to stderr.
 *
 * @param args command-line arguments; args[0] is the document URL
 */
public static void main(final String[] args) {
    try {
        // exactly one argument (the document URL) is expected
        if (args.length != 1) {
            return;
        }

        final DigestURI documentUrl = new DigestURI(args[0], null);
        final odtParser parser = new odtParser();

        // fetch the document, identifying as the crawler
        final RequestHeader header = new RequestHeader();
        header.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
        final byte[] body = Client.wget(documentUrl.toString(), header, 10000);
        final ByteArrayInputStream bodyStream = new ByteArrayInputStream(body);

        // parse the downloaded bytes as an OpenDocument text file
        parser.parse(documentUrl, "application/vnd.oasis.opendocument.text", null, bodyStream);
    } catch (final Exception e) {
        e.printStackTrace();
    }
}
}

@ -25,9 +25,8 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
@ -42,21 +41,17 @@ import java.util.zip.ZipFile;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.document.parser.xml.ODContentHandler;
import net.yacy.document.parser.xml.ODMetaHandler;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.document.parser.xml.ODContentHandler;
import de.anomic.document.parser.xml.ODMetaHandler;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.server.serverCharBuffer;
public class ooxmlParser extends AbstractParser implements Idiom {
@ -133,7 +128,7 @@ public class ooxmlParser extends AbstractParser implements Idiom {
writerFile = File.createTempFile("ooxmlParser",".prt");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
writer = new serverCharBuffer();
writer = new CharBuffer();
}
// extract data
@ -178,8 +173,8 @@ public class ooxmlParser extends AbstractParser implements Idiom {
// create the parser document
Document theDoc = null;
if (writer instanceof serverCharBuffer) {
final byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8");
if (writer instanceof CharBuffer) {
final byte[] contentBytes = ((CharBuffer)writer).toString().getBytes("UTF-8");
theDoc = new Document(
location,
mimeType,
@ -250,27 +245,4 @@ public class ooxmlParser extends AbstractParser implements Idiom {
// Nothing to do here at the moment
super.reset();
}
/**
 * Manual test driver: downloads the document at the URL given as the single
 * command-line argument and runs it through an ooxmlParser. Does nothing if the
 * number of arguments is not exactly one; any exception is printed to stderr.
 *
 * The previous version was copied from odtParser and still instantiated odtParser
 * with the OpenDocument MIME type, so it never exercised this class.
 *
 * @param args command-line arguments; args[0] is the document URL
 */
public static void main(final String[] args) {
    try {
        // exactly one argument (the document URL) is expected
        if (args.length != 1) return;

        // getting the content URL
        final DigestURI contentUrl = new DigestURI(args[0], null);

        // creating a new parser of THIS class (was: odtParser)
        final ooxmlParser testParser = new ooxmlParser();

        // downloading the document content, identifying as the crawler
        final RequestHeader reqHeader = new RequestHeader();
        reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
        // NOTE(review): third argument is presumably a timeout - confirm against Client.wget
        final byte[] content = Client.wget(contentUrl.toString(), reqHeader, 10000);
        final ByteArrayInputStream input = new ByteArrayInputStream(content);

        // parsing the document with an Office Open XML (word processing) MIME type
        testParser.parse(contentUrl, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", null, input);
    } catch (final Exception e) {
        e.printStackTrace();
    }
}
}

@ -25,7 +25,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.File;
import java.io.FileOutputStream;
@ -43,15 +43,15 @@ import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.util.PDFTextStripper;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.server.serverCharBuffer;
public class pdfParser extends AbstractParser implements Idiom {
@ -133,7 +133,7 @@ public class pdfParser extends AbstractParser implements Idiom {
writerFile = File.createTempFile("pdfParser",".prt");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
writer = new serverCharBuffer();
writer = new CharBuffer();
}
try {
stripper.writeText(theDocument, writer ); // may throw a NPE
@ -149,8 +149,8 @@ public class pdfParser extends AbstractParser implements Idiom {
Document theDoc = null;
if (writer instanceof serverCharBuffer) {
final byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8");
if (writer instanceof CharBuffer) {
final byte[] contentBytes = ((CharBuffer)writer).toString().getBytes("UTF-8");
theDoc = new Document(
location,
mimeType,

@ -25,21 +25,21 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
public class pptParser extends AbstractParser implements Idiom {

@ -25,7 +25,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.BufferedReader;
import java.io.BufferedWriter;
@ -37,13 +37,13 @@ import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
public class psParser extends AbstractParser implements Idiom {

@ -25,7 +25,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.ByteArrayInputStream;
import java.io.IOException;
@ -38,22 +38,22 @@ import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.document.content.RSSMessage;
import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.document.parser.xml.RSSFeed;
import net.yacy.document.parser.xml.RSSReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.content.RSSMessage;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.document.parser.html.AbstractScraper;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.document.parser.html.ImageEntry;
import de.anomic.document.parser.html.TransformerWriter;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.document.parser.xml.RSSReader;
import de.anomic.server.serverCharBuffer;
public class rssParser extends AbstractParser implements Idiom {
@ -84,7 +84,7 @@ public class rssParser extends AbstractParser implements Idiom {
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
final HashMap<String, ImageEntry> images = new HashMap<String, ImageEntry>();
final ByteBuffer text = new ByteBuffer();
final serverCharBuffer authors = new serverCharBuffer();
final CharBuffer authors = new CharBuffer();
final RSSFeed feed = new RSSReader(source).getFeed();
if (feed == null) throw new ParserException("no feed in document",location);

@ -25,7 +25,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.InputStream;
import java.util.HashSet;
@ -34,12 +34,12 @@ import java.util.Set;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
public class rtfParser extends AbstractParser implements Idiom {

@ -25,7 +25,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.ByteArrayInputStream;
import java.io.File;
@ -35,7 +35,13 @@ import java.io.OutputStream;
import java.util.HashSet;
import java.util.Set;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.Parser;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CachedFileOutputStream;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -45,12 +51,6 @@ import SevenZip.MyRandomAccessFile;
import SevenZip.Archive.IInArchive;
import SevenZip.Archive.SevenZipEntry;
import SevenZip.Archive.SevenZip.Handler;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.Parser;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.server.serverCachedFileOutputStream;
public class sevenzipParser extends AbstractParser implements Idiom {
@ -118,7 +118,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
public Document parse(final DigestURI location, final String mimeType, final String charset,
final InputStream source) throws ParserException, InterruptedException {
try {
final serverCachedFileOutputStream cfos = new serverCachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
final CachedFileOutputStream cfos = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
FileUtils.copy(source, cfos);
if (cfos.isFallback()) {
return parse(location, mimeType, charset, cfos.getContentFile());
@ -144,7 +144,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
private final Log log;
private final long maxRamSize;
private serverCachedFileOutputStream cfos = null;
private CachedFileOutputStream cfos = null;
private final Document doc;
private final String prefix;
@ -228,7 +228,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
throw ex;
}
this.cfos = (item.isDirectory()) ? null
: new serverCachedFileOutputStream(this.maxRamSize, null, true, item.getSize());
: new CachedFileOutputStream(this.maxRamSize, null, true, item.getSize());
return this.cfos;
}

@ -25,7 +25,7 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser;
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
@ -33,13 +33,13 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import pt.tumba.parser.swf.SWF2HTML;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
public class swfParser extends AbstractParser implements Idiom {

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save