You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/net/yacy/data/ymark/YMarkImporter.java

157 lines
5.6 KiB

// YMarkImporter.java
// (C) 2012 by Stefan Foerster (apfelmaennchen), sof@gmx.de, Norderstedt, Germany
// first published 2012 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.data.ymark;
import java.io.IOException;
import java.util.concurrent.ArrayBlockingQueue;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.search.Switchboard;
/**
 * Base class for bookmark importers built as a producer/consumer pair.
 * <p>
 * The importer itself is the producer: {@link #run()} calls the subclass
 * specific {@link #parse()} and afterwards always appends
 * {@code YMarkEntry.POISON} to the {@link #bookmarks} queue to mark the end
 * of the input. A {@link Consumer} obtained from {@link #getConsumer} drains
 * that queue, stores the entries and optionally hands their URLs to an
 * auto tagging queue and to the crawler.
 */
public abstract class YMarkImporter implements Runnable {

    // Statics
    /** SAX feature identifier "namespace-prefixes". */
    public final static String XML_NAMESPACE_PREFIXES = "http://xml.org/sax/features/namespace-prefixes";
    /** SAX feature identifier "namespaces". */
    public final static String XML_NAMESPACES = "http://xml.org/sax/features/namespaces";
    /** SAX feature identifier "validation". */
    public final static String XML_VALIDATION = "http://xml.org/sax/features/validation";

    /** Name of this importer; only used as a prefix in log messages. Set via {@link #setImporter(String)}. */
    protected String importer;
    /** Bounded hand-over queue between {@link #parse()} (producer) and the {@link Consumer}. */
    protected ArrayBlockingQueue<YMarkEntry> bookmarks;
    /** Reader delivering the bookmark data; also the source of the progress values. */
    protected final MonitoredReader bmk_file;
    /** Target folder string, passed through {@code YMarkUtil.cleanFoldersString}. */
    protected final String targetFolder;
    /** Source folder string, passed through {@code YMarkUtil.cleanFoldersString}. */
    protected final String sourceFolder;

    /**
     * @param bmk_file     reader providing the bookmark data to import
     * @param queueSize    capacity of the internal bookmark queue
     * @param sourceFolder source folder string, cleaned with {@code YMarkUtil.cleanFoldersString}
     * @param targetFolder target folder string, cleaned with {@code YMarkUtil.cleanFoldersString}
     */
    public YMarkImporter(final MonitoredReader bmk_file, final int queueSize, final String sourceFolder, final String targetFolder) {
        this.bookmarks = new ArrayBlockingQueue<YMarkEntry>(queueSize);
        this.bmk_file = bmk_file;
        this.sourceFolder = YMarkUtil.cleanFoldersString(sourceFolder);
        this.targetFolder = YMarkUtil.cleanFoldersString(targetFolder);
    }

    /**
     * Runs {@link #parse()} and then always puts {@code YMarkEntry.POISON}
     * into the bookmark queue, no matter whether parsing succeeded or failed.
     * Exceptions thrown by {@link #parse()} are logged, not propagated. If the
     * thread was interrupted along the way, its interrupt status is restored
     * before this method returns.
     */
    @Override
    public void run() {
        boolean interrupted = false;
        try {
            parse();
        } catch (final InterruptedException e) {
            interrupted = true;
            ConcurrentLog.logException(e);
        } catch (final Exception e) {
            ConcurrentLog.logException(e);
        } finally {
            // A pending interrupt would make put() fail immediately and the
            // consumer would wait forever for the end marker. Clear the flag
            // for the delivery and restore it afterwards.
            if (Thread.interrupted()) {
                interrupted = true;
            }
            try {
                this.bookmarks.put(YMarkEntry.POISON);
                // log only after the pill really is in the queue
                ConcurrentLog.info(YMarkTables.BOOKMARKS_LOG, this.importer+" Importer inserted poison pill in queue");
            } catch (final InterruptedException e1) {
                interrupted = true;
                ConcurrentLog.logException(e1);
            } finally {
                if (interrupted) {
                    Thread.currentThread().interrupt();
                }
            }
        }
    }

    /**
     * Takes the next entry from the bookmark queue, blocking until one is available.
     *
     * @return the next entry (possibly {@code YMarkEntry.POISON}), or
     *         {@code null} if the calling thread was interrupted while
     *         waiting; in that case the exception is logged and the thread's
     *         interrupt status is set again
     */
    public YMarkEntry take() {
        try {
            return this.bookmarks.take();
        } catch (final InterruptedException e) {
            ConcurrentLog.logException(e);
            // keep the interruption visible to the caller
            Thread.currentThread().interrupt();
            return null;
        }
    }

    /**
     * @param importer name used as prefix in this importer's log messages
     */
    public void setImporter(final String importer) {
        this.importer = importer;
    }

    /**
     * @return the current progress as reported by the underlying reader
     */
    public long getProgress() {
        return this.bmk_file.getProgress();
    }

    /**
     * @return the maximum progress value as reported by the underlying reader
     */
    public long maxProgress() {
        return this.bmk_file.maxProgress();
    }

    /**
     * Format specific parsing, invoked by {@link #run()}.
     * NOTE(review): implementations are presumably expected to put the parsed
     * entries into {@link #bookmarks}; confirm against the subclasses.
     *
     * @throws Exception on any failure; {@link #run()} logs it
     */
    public abstract void parse() throws Exception;

    /**
     * Creates a {@link Consumer} bound to this importer's bookmark queue.
     *
     * @param sb               switchboard used to store and crawl the bookmarks
     * @param bmk_user         user the bookmarks are added for
     * @param autoTaggingQueue queue receiving URLs to be auto tagged
     * @param autotag          whether URLs are handed to {@code autoTaggingQueue} at all
     * @param empty            if {@code true}, only bookmarks without tags (or with the default tags) are auto tagged
     * @param indexing         crawl mode: {@code "single"}, {@code "onelink"} or {@code "fulldomain"}; any other value disables crawling
     * @param medialink        passed through to {@code YMarkEntry.crawl}
     * @return a new consumer
     */
    public Consumer getConsumer(final Switchboard sb, final String bmk_user, final ArrayBlockingQueue<String> autoTaggingQueue,
            final boolean autotag, final boolean empty, final String indexing, final boolean medialink) {
        return new Consumer(sb, bmk_user, autoTaggingQueue, autotag, empty, indexing, medialink);
    }

    /**
     * Drains the importer's bookmark queue until the poison pill shows up (or
     * the thread is interrupted), adding every bookmark whose URL starts with
     * "http" to the bookmark table and optionally queueing it for auto
     * tagging and crawling.
     */
    public class Consumer implements Runnable {

        private final Switchboard sb;
        private final String bmk_user;
        private final ArrayBlockingQueue<String> autoTaggingQueue;
        private final String indexing;
        private final boolean autotag;
        private final boolean empty;
        private final boolean medialink;

        /**
         * @param sb               switchboard used to store and crawl the bookmarks
         * @param bmk_user         user the bookmarks are added for
         * @param autoTaggingQueue queue receiving URLs to be auto tagged
         * @param autotag          whether URLs are handed to {@code autoTaggingQueue} at all
         * @param empty            if {@code true}, only bookmarks without tags (or with the default tags) are auto tagged
         * @param indexing         crawl mode: {@code "single"}, {@code "onelink"} or {@code "fulldomain"}; any other value disables crawling
         * @param medialink        passed through to {@code YMarkEntry.crawl}
         */
        public Consumer(final Switchboard sb, final String bmk_user, final ArrayBlockingQueue<String> autoTaggingQueue,
                final boolean autotag, final boolean empty, final String indexing, final boolean medialink) {
            this.sb = sb;
            this.bmk_user = bmk_user;
            this.autoTaggingQueue = autoTaggingQueue;
            this.autotag = autotag;
            this.empty = empty;
            this.indexing = indexing;
            this.medialink = medialink;
        }

        /**
         * Processes entries until {@code YMarkEntry.POISON} is taken or the
         * thread is interrupted. When auto tagging is enabled,
         * {@code YMarkAutoTagger.POISON} is put into the auto tagging queue
         * at the end, also after an interruption.
         */
        @Override
        public void run() {
            YMarkEntry bmk;
            // take() yields null when the thread was interrupted; without the
            // null check the entry would be dereferenced below and the
            // resulting NullPointerException would end this thread before
            // the auto tagging poison pill is delivered.
            while ((bmk = take()) != null && bmk != YMarkEntry.POISON) {
                try {
                    final String url = bmk.get(YMarkEntry.BOOKMARK.URL.key());
                    // other protocols could cause problems
                    if (url == null || !url.startsWith("http")) {
                        continue;
                    }
                    this.sb.tables.bookmarks.addBookmark(this.bmk_user, bmk, true, true);

                    // with 'empty' set, only untagged / default-tagged bookmarks are auto tagged
                    if (this.autotag
                            && (!this.empty
                                || !bmk.containsKey(YMarkEntry.BOOKMARK.TAGS.key())
                                || bmk.get(YMarkEntry.BOOKMARK.TAGS.key()).equals(YMarkEntry.BOOKMARK.TAGS.deflt()))) {
                        this.autoTaggingQueue.put(url);
                    }

                    // fill crawler (constant-first comparison tolerates a null indexing mode)
                    if ("single".equals(this.indexing)) {
                        bmk.crawl(YMarkCrawlStart.CRAWLSTART.SINGLE, this.medialink, this.sb);
                    } else if ("onelink".equals(this.indexing)) {
                        bmk.crawl(YMarkCrawlStart.CRAWLSTART.ONE_LINK, this.medialink, this.sb);
                    } else if ("fulldomain".equals(this.indexing)) {
                        bmk.crawl(YMarkCrawlStart.CRAWLSTART.FULL_DOMAIN, this.medialink, this.sb);
                    }
                } catch (final IOException e) {
                    ConcurrentLog.logException(e);
                } catch (final InterruptedException e) {
                    ConcurrentLog.logException(e);
                    // honour the interruption: restore the flag and stop consuming
                    Thread.currentThread().interrupt();
                    break;
                }
            }
            if (this.autotag) {
                // Clear a pending interrupt so the pill can still be delivered
                // (put() would otherwise fail at once); restore it afterwards.
                boolean interrupted = Thread.interrupted();
                try {
                    this.autoTaggingQueue.put(YMarkAutoTagger.POISON);
                    ConcurrentLog.info(YMarkTables.BOOKMARKS_LOG, YMarkImporter.this.importer+" inserted poison pill into autoTagging queue");
                } catch (final InterruptedException e) {
                    interrupted = true;
                    ConcurrentLog.logException(e);
                } finally {
                    if (interrupted) {
                        Thread.currentThread().interrupt();
                    }
                }
            }
        }
    }
}