Support for indexing of RSS feeds!

- added scanning in the HTML parser for RSS feeds
- storage of RSS feed addresses; they can be viewed with http://localhost:8080/Tables_p.html?table=rss
- RSS items retrieved via http://localhost:8080/Load_RSS_p.html (in the Index Creation menu) can be selected and indexed
- an RSS feed retrieved via http://localhost:8080/Load_RSS_p.html can now be fully indexed
- indexing of RSS feeds can be placed in the scheduler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7073 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 0f276dd63f
commit 0010cd9db1

@ -36,11 +36,32 @@
<dl>
<dt><b>URL of the RSS feed</b></dt>
<dd><input type="text" name="url" value="#[url]#" size="60" maxlength="256"/></dd>
<dt>Simulation Mode</dt>
<dt>Preview</dt>
<dd><input type="submit" name="showrss" value="Show RSS Items" /></dd>
<dt>Indexing Mode</dt>
<dd>#(showload)#Available after successful loading of rss feed in simulation mode::
<!--<input type="submit" name="loadrss" value="Index RSS Items" />-->not yet implemented <b>THIS INTERFACE IS A STUB - DEVELOPMENT IS ONGOING</b>
<dt>Indexing</dt>
<dd>#(showload)#Available after successful loading of rss feed in preview::
<input type="hidden" name="url" value="#[rss]#" />
<input type="submit" name="indexAllItemContent" value="Add All Items to Index (full content of url)" />
<dl>
<dt>once<input type="radio" name="repeat" value="off" checked="checked"/></dt>
<dd>load this feed once now</dd>
<dt>scheduled<input type="radio" name="repeat" value="on"/></dt>
<dd>repeat the feed loading every<br/>
<select name="repeat_time">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7" selected="selected">7</option>
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
<option value="12">12</option><option value="14">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
</select>
<select name="repeat_unit">
<option value="selminutes">minutes</option>
<option value="selhours">hours</option>
<option value="seldays" selected="selected">days</option>
</select> automatically.
</dd>
</dl>
#(/showload)#</dd>
</dl>
@ -49,7 +70,7 @@
#(showitems)#::
<form name="rssfeed"><fieldset>
<legend><label for="table">RSS Feed</label></legend>
<legend><label for="table">RSS Feed of #[rss]#</label></legend>
<dl>
<dt>Title</dt><dd>#[title]#</dd>
<dt>Author</dt><dd>#[author]#</dd>
@ -62,6 +83,7 @@
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name, this.value)" /></td>
<td>State</td>
<td>Title</td>
<td>URL</td>
<td>Author</td>
@ -71,7 +93,8 @@
</tr>
#{item}#
<tr class="TableCellLight">
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[hash]#" /></td>
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[guid]#" /></td>
<td>#(state)#new::enqueued::indexed#(/state)#</td>
<td><a href="#[link]#">#[title]#</a></td>
<td><a href="#[link]#">#[link]#</a></td>
<td>#[author]#</td>
@ -81,7 +104,11 @@
</tr>
#{/item}#
</table>
<p>
<input type="hidden" name="num" value="#[num]#" />
<input type="hidden" name="url" value="#[rss]#" />
<input type="submit" name="indexSelectedItemContent" value="Add Selected Items to Index (full content of url)" /></dt>
</p>
</fieldset></form>
#(/showitems)#

@ -21,23 +21,33 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.DateFormat;
import java.util.Date;
import java.util.Map;
import net.yacy.cora.document.Hit;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ComparableARC;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.WorkTables;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class Load_RSS_p {
private static final ARC<byte[], Date> indexTriggered = new ComparableARC<byte[], Date>(1000, Base64Order.enhancedCoder);
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final serverObjects prop = new serverObjects();
@ -51,11 +61,17 @@ public class Load_RSS_p {
prop.put("url", post.get("url", ""));
int repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
if (!post.get("repeat", "off").equals("on") && repeat_time > 0) repeat_time = -1;
boolean record_api = false;
DigestURI url = null;
try {
url = post.containsKey("url") ? new DigestURI(post.get("url", ""), null) : null;
} catch (MalformedURLException e) {
Log.logException(e);
Log.logWarning("Load_RSS_p", "url not well-formed: '" + post.get("url", "") + "'");
}
// if we have an url then try to load the rss
@ -69,6 +85,54 @@ public class Load_RSS_p {
Log.logException(e);
}
// index all selected items: description only
if (rss != null && post.containsKey("indexSelectedItemContent")) {
RSSFeed feed = rss.getFeed();
loop: for (Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) try {
RSSMessage message = feed.getMessage(entry.getValue().substring(5));
DigestURI messageurl = new DigestURI(message.getLink());
if (indexTriggered.containsKey(messageurl.hash())) continue loop;
if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop;
sb.addToIndex(messageurl, null, null);
indexTriggered.put(messageurl.hash(), new Date());
} catch (IOException e) {
Log.logException(e);
} catch (Failure e) {
Log.logException(e);
}
}
}
if (rss != null && post.containsKey("indexAllItemContent")) {
record_api = true;
RSSFeed feed = rss.getFeed();
loop: for (RSSMessage message: feed) {
try {
DigestURI messageurl = new DigestURI(message.getLink());
if (indexTriggered.containsKey(messageurl.hash()) && post.containsKey("indexSelectedItemContent")) continue loop;
if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop;
sb.addToIndex(messageurl, null, null);
indexTriggered.put(messageurl.hash(), new Date());
} catch (IOException e) {
Log.logException(e);
} catch (Failure e) {
Log.logException(e);
}
}
}
if (record_api) {
// record API action
if (repeat_time > 0) {
// store as scheduled api call
sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false), repeat_time, repeat_unit.substring(3));
} else {
// store just a protocol
sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false));
}
}
// show items from rss
if (rss != null) {
prop.put("showitems", 1);
RSSFeed feed = rss.getFeed();
@ -76,27 +140,30 @@ public class Load_RSS_p {
prop.putHTML("showitems_title", channel.getTitle());
String author = channel.getAuthor();
if (author == null || author.length() == 0) author = channel.getCopyright();
Date pubDate = channel.getPubDate();
prop.putHTML("showitems_author", author == null ? "" : author);
prop.putHTML("showitems_description", channel.getDescription());
prop.putHTML("showitems_language", channel.getLanguage());
prop.putHTML("showitems_date", DateFormat.getDateTimeInstance().format(channel.getPubDate()));
prop.putHTML("showitems_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
prop.putHTML("showitems_ttl", channel.getTTL());
prop.putHTML("showitems_docs", channel.getDocs());
int i = 0;
for (final Hit item: feed) {
try {
url = new DigestURI(item.getLink(), null);
DigestURI messageurl = new DigestURI(item.getLink(), null);
author = item.getAuthor();
if (author == null) author = item.getCopyright();
pubDate = item.getPubDate();
prop.put("showitems_item_" + i + "_count", i);
prop.putHTML("showitems_item_" + i + "_hash", new String(url.hash()));
prop.put("showitems_item_" + i + "_state", sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null ? 2 : indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
prop.putHTML("showitems_item_" + i + "_guid", item.getGuid());
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_link", url.toNormalform(false, false));
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(false, false));
prop.putHTML("showitems_item_" + i + "_description", item.getDescription());
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
prop.putHTML("showitems_item_" + i + "_date", DateFormat.getDateTimeInstance().format(item.getPubDate()));
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
i++;
} catch (MalformedURLException e) {
Log.logException(e);
@ -105,7 +172,11 @@ public class Load_RSS_p {
}
prop.put("showitems_item", i);
prop.put("showitems_num", i);
if (i > 0) prop.put("showload", 1);
prop.putHTML("showitems_rss", url.toNormalform(true, false));
if (i > 0) {
prop.put("showload", 1);
prop.put("showload_rss", url.toNormalform(true, false));
}
}
return prop;

@ -28,7 +28,7 @@
<body id="Tables">
#%env/templates/header.template%#
#%env/templates/submenuConfig.template%#
#(showselection)#::
<h2>Table Administration</h2>
<form action="Tables_p.html" method="get">
<fieldset><legend>Table Selection</legend>
@ -60,6 +60,7 @@
</dl>
</fieldset>
</form>
#(/showselection)#
#(showtable)#::
<form action="Tables_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" name="tablelisting">
<fieldset>

@ -39,52 +39,35 @@ public class Tables_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
prop.put("showtable", 0);
prop.put("showedit", 0);
prop.put("showselection", 0);
if (post == null) {
prop.put("pattern", "");
// show table selection
int count = 0;
Iterator<String> ti = sb.tables.tables();
String tablename;
while (ti.hasNext()) {
tablename = ti.next();
prop.put("tables_" + count + "_name", tablename);
prop.put("tables_" + count + "_selected", 0);
count++;
}
prop.put("tables", count);
// generate table
prop.put("showtable", 0);
prop.put("showedit", 0);
// the peer address
prop.put("address", sb.peers.mySeed().getPublicAddress());
// return rewrite properties
return prop;
}
String table = post.get("table", null);
String table = (post == null) ? null : post.get("table", null);
if (table != null && !sb.tables.hasHeap(table)) table = null;
String counts = post.get("count", null);
int maxcount = (counts == null || counts.equals("all")) ? Integer.MAX_VALUE : Integer.parseInt(counts);
String pattern = post.get("search", "");
Pattern matcher = (pattern.length() == 0 || pattern.equals(".*")) ? null : Pattern.compile(".*" + pattern + ".*");
prop.put("pattern", pattern);
// show table selection
int count = 0;
Iterator<String> ti = sb.tables.tables();
String tablename;
prop.put("showselection", 1);
while (ti.hasNext()) {
tablename = ti.next();
prop.put("tables_" + count + "_name", tablename);
prop.put("tables_" + count + "_selected", (table != null && table.equals(tablename)) ? 1 : 0);
prop.put("showselection_tables_" + count + "_name", tablename);
prop.put("showselection_tables_" + count + "_selected", (table != null && table.equals(tablename)) ? 1 : 0);
count++;
}
prop.put("tables", count);
prop.put("showselection_tables", count);
prop.put("showselection_pattern", "");
if (post == null) return prop; // return rewrite properties
String counts = post.get("count", null);
int maxcount = (counts == null || counts.equals("all")) ? Integer.MAX_VALUE : Integer.parseInt(counts);
String pattern = post.get("search", "");
Pattern matcher = (pattern.length() == 0 || pattern.equals(".*")) ? null : Pattern.compile(".*" + pattern + ".*");
prop.put("pattern", pattern);
List<String> columns = null;
if (table != null) try {

@ -36,13 +36,15 @@ public class table_p {
final serverObjects prop = new serverObjects();
String table = (post == null) ? null : post.get("table", null);
if (table != null && !sb.tables.hasHeap(table)) table = null;
if (table == null) {
prop.put("showtable", 0);
return prop;
}
prop.put("showtable", 0);
if (table == null) return prop;
boolean showpk = post.containsKey("pk");
String selectKey = post.containsKey("selectKey") ? post.get("selectKey") : null;
String selectValue = (selectKey != null && post.containsKey("selectValue")) ? post.get("selectValue") : null;
ArrayList<String> columns = null;
try {
columns = sb.tables.columns(table);
@ -86,8 +88,8 @@ public class table_p {
final Iterator<Tables.Row> mapIterator = sb.tables.orderByPK(plainIterator, maxCount).iterator();
Tables.Row trow;
boolean dark = true;
byte[] cell;
while ((mapIterator.hasNext()) && (count < maxCount)) {
String cellName, cellValue;
rowloop: while ((mapIterator.hasNext()) && (count < maxCount)) {
trow = mapIterator.next();
if (row == null) continue;
prop.put("showtable_list_" + count + "_dark", ((dark) ? 1 : 0) ); dark=!dark;
@ -95,9 +97,13 @@ public class table_p {
prop.put("showtable_list_" + count + "_showpk_pk", new String(trow.getPK()));
prop.put("showtable_list_" + count + "_count", count);
for (int i = 0; i < columns.size(); i++) {
cell = trow.get(columns.get(i));
prop.putHTML("showtable_list_" + count + "_columns_" + i + "_column", columns.get(i));
prop.putHTML("showtable_list_" + count + "_columns_" + i + "_cell", cell == null ? "" : new String(cell));
cellName = columns.get(i);
cellValue = new String(trow.get(cellName));
if (selectKey != null && cellName.equals(selectKey) && !cellValue.matches(selectValue)) {
continue rowloop;
}
prop.putHTML("showtable_list_" + count + "_columns_" + i + "_column", cellName);
prop.putHTML("showtable_list_" + count + "_columns_" + i + "_cell", cellValue);
}
prop.put("showtable_list_" + count + "_columns", columns.size());

@ -361,7 +361,7 @@ public class Segment {
document.outboundLinks(), // outbound links
searchEvent // a search event that can have results directly
);
final long indexingEndTime = System.currentTimeMillis();
if (log.isInfo()) {

@ -1903,6 +1903,23 @@ public final class Switchboard extends serverSwitch {
return;
}
// store rss feeds in document into rss table
for (Map.Entry<MultiProtocolURI, String> rssEntry : document.getRSS().entrySet()) {
Tables.Data rssRow = new Tables.Data();
rssRow.put("referrer", queueEntry.url().hash());
rssRow.put("url", rssEntry.getKey().toNormalform(true, false).getBytes());
rssRow.put("title", rssEntry.getValue().getBytes());
rssRow.put("recording_date", new Date());
//rssRow.put("last_load_date", "".getBytes());
//rssRow.put("last_load_count", "".getBytes());
//rssRow.put("avg_upd_per_day", "".getBytes());
try {
this.tables.update("rss", new DigestURI(rssEntry.getKey()).hash(), rssRow);
} catch (IOException e) {
Log.logException(e);
}
}
// update url result list statistics
crawlResults.stack(
newEntry, // loaded url db entry
@ -1970,17 +1987,14 @@ public final class Switchboard extends serverSwitch {
*/
public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName) throws IOException, Parser.Failure {
final Segments.Process process = Segments.Process.LOCALCRAWLING;
if (indexSegments.segment(process).urlMetadata.exists(url.hash())) {
searchEvent.addHeuristic(url.hash(), heuristicName, true);
return; // don't do double-work
}
if (searchEvent != null) searchEvent.addHeuristic(url.hash(), heuristicName, true);
if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
final Request request = loader.request(url, true, true);
String acceptedError = this.crawlStacker.checkAcceptance(url, this.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), 0);
if (acceptedError != null) {
log.logInfo("Heuristic: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
log.logWarning("addToIndex: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
return;
}
searchEvent.addHeuristic(url.hash(), heuristicName, false);
new Thread() {public void run() {
try {
Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
@ -1994,12 +2008,12 @@ public final class Switchboard extends serverSwitch {
ResultImages.registerImages(url, document, true);
webStructure.generateCitationReference(url, document, condenser, response.lastModified());
storeDocumentIndex(process, response, document, condenser, searchEvent);
log.logInfo("heuristic fill of url " + url.toNormalform(true, true) + " finished");
log.logInfo("addToIndex fill of url " + url.toNormalform(true, true) + " finished");
}
} catch (IOException e) {
//Log.logException(e);
log.logWarning("addToIndex: failed loading " + url.toNormalform(false, false) + ": " + e.getMessage());
} catch (Parser.Failure e) {
//Log.logException(e);
log.logWarning("addToIndex: failed parsing " + url.toNormalform(false, false) + ": " + e.getMessage());
}
}}.start();
}

@ -26,7 +26,7 @@ import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
public class RSSFeed implements Iterable<Hit> {
public class RSSFeed implements Iterable<RSSMessage> {
public static final int DEFAULT_MAXSIZE = 1000;
@ -78,7 +78,7 @@ public class RSSFeed implements Iterable<Hit> {
return messages.size();
}
public Iterator<Hit> iterator() {
public Iterator<RSSMessage> iterator() {
return new messageIterator();
}
@ -92,7 +92,7 @@ public class RSSFeed implements Iterable<Hit> {
}
}
public class messageIterator implements Iterator<Hit>{
public class messageIterator implements Iterator<RSSMessage>{
Iterator<String> GUIDiterator;
String lastGUID;

@ -33,9 +33,6 @@ import net.yacy.kelondro.util.DateFormatter;
public class RSSMessage implements Hit {
// statics for item generation and automatic categorization
private static int guidcount = 0;
public static enum Token {
title("title"),
@ -73,7 +70,8 @@ public class RSSMessage implements Hit {
return this.keys;
}
}
private static String artificialGuidPrefix = "c0_";
public static final RSSMessage POISON = new RSSMessage("", "", "");
public static final HashSet<String> tags = new HashSet<String>();
@ -86,21 +84,25 @@ public class RSSMessage implements Hit {
private final Map<String, String> map;
public RSSMessage(final String title, final String description, final String link) {
this();
setValue("title", title);
setValue("description", description);
setValue("link", link);
setValue("pubDate", DateFormatter.formatShortSecond(new Date()));
setValue("guid", Integer.toHexString((title + description + link).hashCode()));
this.map = new ConcurrentHashMap<String, String>();
map.put("title", title);
map.put("description", description);
map.put("link", link);
map.put("pubDate", DateFormatter.formatShortSecond(new Date()));
map.put("guid", artificialGuidPrefix + Integer.toHexString((title + description + link).hashCode()));
}
public RSSMessage() {
this.map = new ConcurrentHashMap<String, String>();
this.map.put("guid", Long.toHexString(System.currentTimeMillis()) + ":" + guidcount++);
}
public void setValue(final String name, final String value) {
map.put(name, value);
// if possible generate a guid if not existent so far
if ((name.equals("title") || name.equals("description") || name.equals("link")) &&
(!map.containsKey("guid") || map.get("guid").startsWith(artificialGuidPrefix))) {
map.put("guid", artificialGuidPrefix + Integer.toHexString((getTitle() + getDescription() + getLink()).hashCode()));
}
}
public String getTitle() {

@ -67,6 +67,7 @@ public class Document {
private final StringBuilder description; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private final Map<MultiProtocolURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
private final Map<MultiProtocolURI, String> rss; // all embedded rss feeds
private final HashMap<MultiProtocolURI, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
@ -83,7 +84,10 @@ public class Document {
public Document(final MultiProtocolURI location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author, final String publisher,
final String[] sections, final String abstrct,
final Object text, final Map<MultiProtocolURI, String> anchors, final HashMap<MultiProtocolURI, ImageEntry> images,
final Object text,
final Map<MultiProtocolURI, String> anchors,
final Map<MultiProtocolURI, String> rss,
final HashMap<MultiProtocolURI, ImageEntry> images,
boolean indexingDenied) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
@ -94,6 +98,7 @@ public class Document {
this.sections = (sections == null) ? new LinkedList<String>() : Arrays.asList(sections);
this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct);
this.anchors = (anchors == null) ? new HashMap<MultiProtocolURI, String>(0) : anchors;
this.rss = (rss == null) ? new HashMap<MultiProtocolURI, String>(0) : rss;
this.images = (images == null) ? new HashMap<MultiProtocolURI, ImageEntry>() : images;
this.publisher = publisher;
this.hyperlinks = null;
@ -287,6 +292,12 @@ dc_rights
return anchors;
}
public Map<MultiProtocolURI, String> getRSS() {
// returns all links embedded as anchors (clickeable entities)
// this is a url(String)/text(String) map
return rss;
}
// the next three methods provide a calculated view on the getAnchors/getImages:
@ -504,8 +515,9 @@ dc_rights
this.text = new ByteArrayOutputStream();
}
FileUtils.copy(doc.getText(), (ByteArrayOutputStream) this.text);
anchors.putAll(doc.getAnchors());
rss.putAll(doc.getRSS());
ContentScraper.addAllImages(images, doc.getImages());
}
}
@ -618,8 +630,9 @@ dc_rights
final StringBuilder title = new StringBuilder();
final StringBuilder description = new StringBuilder();
final LinkedList<String> sectionTitles = new LinkedList<String>();
final Map<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, String> rss = new HashMap<MultiProtocolURI, String>();
final HashMap<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
for (Document doc: docs) {
@ -659,6 +672,7 @@ dc_rights
}
}
anchors.putAll(doc.getAnchors());
rss.putAll(doc.getRSS());
ContentScraper.addAllImages(images, doc.getImages());
}
return new Document(
@ -674,6 +688,7 @@ dc_rights
description.toString(),
content.getBytes(),
anchors,
rss,
images,
false);
}

@ -258,6 +258,7 @@ public class DCEntry extends TreeMap<String, String> {
getDescription().getBytes("UTF-8"),
null,
null,
null,
false);
} catch (UnsupportedEncodingException e) {
Log.logException(e);

@ -68,6 +68,7 @@ public class csvParser extends AbstractParser implements Parser {
sb.toString().getBytes(charset),
null,
null,
null,
false)};
} catch (UnsupportedEncodingException e) {
throw new Parser.Failure("error in csvParser, getBytes: " + e.getMessage(), location);

@ -98,6 +98,7 @@ public class docParser extends AbstractParser implements Parser {
contents.toString().getBytes("UTF-8"),
null,
null,
null,
false)};
} catch (UnsupportedEncodingException e) {
throw new Parser.Failure("error in docParser, getBytes: " + e.getMessage(), location);

@ -79,6 +79,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
// class variables: collectors for links
private HashMap<MultiProtocolURI, String> rss;
private HashMap<MultiProtocolURI, String> anchors;
private HashMap<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final HashMap<String, String> metas;
@ -104,6 +105,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// it is only the reference for relative links
super(linkTags0, linkTags1);
this.root = root;
this.rss = new HashMap<MultiProtocolURI, String>();
this.anchors = new HashMap<MultiProtocolURI, String>();
this.images = new HashMap<MultiProtocolURI, ImageEntry>();
this.metas = new HashMap<String, String>();
@ -188,14 +190,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", ""));
if (newLink != null) {
final String type = tagopts.getProperty("rel", "");
final String rel = tagopts.getProperty("rel", "");
final String linktitle = tagopts.getProperty("title", "");
final String type = tagopts.getProperty("type", "");
if (type.equalsIgnoreCase("shortcut icon")) {
if (rel.equalsIgnoreCase("shortcut icon")) {
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
images.put(ie.url(), ie);
this.favicon = newLink;
} else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) {
} else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
rss.put(newLink, linktitle);
} else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
anchors.put(newLink, linktitle);
}
}
@ -355,6 +360,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return anchors;
}
public Map<MultiProtocolURI, String> getRSS() {
// returns a url (String) / name (String) relation
return rss;
}
/**
* get all images
* @return a map of <urlhash, ImageEntry>

@ -142,6 +142,7 @@ public class htmlParser extends AbstractParser implements Parser {
scraper.getDescription(),
scraper.getText(),
scraper.getAnchors(),
scraper.getRSS(),
scraper.getImages(),
scraper.indexingDenied())};
//scraper.close();

@ -199,6 +199,7 @@ public class genericImageParser extends AbstractParser implements Parser {
description == null ? "" : description, // description
infoString.getBytes(), // content text
anchors, // anchors
null,
images,
false)}; // images
}

@ -169,6 +169,7 @@ public class odtParser extends AbstractParser implements Parser {
contentBytes,
null,
null,
null,
false)};
return docs;
} catch (final Exception e) {

@ -158,6 +158,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
contentBytes,
null,
null,
null,
false)};
return docs;
} catch (final Exception e) {

@ -157,6 +157,7 @@ public class pdfParser extends AbstractParser implements Parser {
contentBytes,
null,
null,
null,
false)};
}

@ -95,6 +95,7 @@ public class pptParser extends AbstractParser implements Parser {
contents.getBytes("UTF-8"),
null,
null,
null,
false)};
return docs;
} catch (final Exception e) {

@ -111,6 +111,7 @@ public class psParser extends AbstractParser implements Parser {
null, // abstract
outputFile, // fulltext
null, // anchors
null, // rss
null, // images
false)}; // indexingdenied

@ -87,6 +87,7 @@ public class rssParser extends AbstractParser implements Parser {
item.getDescription(),
null,
anchors,
null,
new HashMap<MultiProtocolURI, ImageEntry>(),
false);
docs.add(doc);

@ -79,6 +79,7 @@ public class rtfParser extends AbstractParser implements Parser {
bodyText.getBytes("UTF-8"),
null,
null,
null,
false)};
}
catch (final Exception e) {

@ -56,7 +56,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
}
public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException {
final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, null, (Object)null, null, null, false);
final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, null, (Object)null, null, null, null, false);
Handler archive;
super.log.logFine("opening 7zip archive...");
try {

@ -65,7 +65,6 @@ public class swfParser extends AbstractParser implements Parser {
Log.logException(e);
throw new Parser.Failure(e.getMessage(), location);
} catch (IOException e) {
Log.logException(e);
throw new Parser.Failure(e.getMessage(), location);
} catch (Exception e) {
Log.logException(e);
@ -117,6 +116,7 @@ public class swfParser extends AbstractParser implements Parser {
contents.getBytes("UTF-8"), // the parsed document text
anchors, // a map of extracted anchors
null,
null,
false)}; // a treeset of image URLs
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -97,6 +97,7 @@ public class torrentParser extends AbstractParser implements Parser {
filenames.toString().getBytes(charset),
null,
null,
null,
false)};
} catch (UnsupportedEncodingException e) {
throw new Parser.Failure("error in torrentParser, getBytes: " + e.getMessage(), location);

@ -212,6 +212,7 @@ public class vcfParser extends AbstractParser implements Parser {
"vCard", // an abstract
text, // the parsed document text
anchors, // a map of extracted anchors
null,
null, // a treeset of image URLs
false)};
} catch (final Exception e) {

@ -113,6 +113,7 @@ public class vsdParser extends AbstractParser implements Parser {
abstrct, // an abstract
contents.getBytes("UTF-8"), // the parsed document text
null, // a map of extracted anchors
null,
null, // a treeset of image URLs
false)};
} catch (final Exception e) {

@ -125,6 +125,7 @@ public class xlsParser extends AbstractParser implements Parser {
contents.getBytes("UTF-8"),
null,
null,
null,
false)};
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -362,7 +362,7 @@ public class Tables {
}
public class Data extends LinkedHashMap<String, byte[]> {
public static class Data extends LinkedHashMap<String, byte[]> {
private static final long serialVersionUID = 978426054043749337L;

Loading…
Cancel
Save