You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
255 lines
13 KiB
255 lines
13 KiB
import java.io.BufferedReader;
|
|
import java.io.ByteArrayInputStream;
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.io.InputStreamReader;
|
|
import java.net.MalformedURLException;
|
|
import java.nio.charset.StandardCharsets;
|
|
import java.util.Iterator;
|
|
import java.util.concurrent.ArrayBlockingQueue;
|
|
import java.util.regex.Pattern;
|
|
import java.util.zip.GZIPInputStream;
|
|
|
|
import org.xml.sax.SAXException;
|
|
|
|
import net.yacy.cora.document.encoding.UTF8;
|
|
import net.yacy.cora.protocol.ClientIdentification;
|
|
import net.yacy.cora.protocol.RequestHeader;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.data.BookmarksDB.Bookmark;
|
|
import net.yacy.data.UserDB;
|
|
import net.yacy.data.WorkTables;
|
|
import net.yacy.data.ymark.MonitoredReader;
|
|
import net.yacy.data.ymark.YMarkAutoTagger;
|
|
import net.yacy.data.ymark.YMarkCrawlStart;
|
|
import net.yacy.data.ymark.YMarkDMOZImporter;
|
|
import net.yacy.data.ymark.YMarkEntry;
|
|
import net.yacy.data.ymark.YMarkHTMLImporter;
|
|
import net.yacy.data.ymark.YMarkJSONImporter;
|
|
import net.yacy.data.ymark.YMarkTables;
|
|
import net.yacy.data.ymark.YMarkUtil;
|
|
import net.yacy.data.ymark.YMarkXBELImporter;
|
|
import net.yacy.document.Parser.Failure;
|
|
import net.yacy.kelondro.blob.Tables;
|
|
import net.yacy.kelondro.workflow.OneTimeBusyThread;
|
|
import net.yacy.search.Switchboard;
|
|
import net.yacy.server.serverObjects;
|
|
import net.yacy.server.serverSwitch;
|
|
|
|
public class import_ymark {
|
|
|
|
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
|
final Switchboard sb = (Switchboard) env;
|
|
final serverObjects prop = new serverObjects();
|
|
final UserDB.Entry user = sb.userDB.getUser(header);
|
|
final boolean isAdmin = (sb.verifyAuthentication(header));
|
|
final boolean isAuthUser = user!= null && user.hasRight(UserDB.AccessRight.BOOKMARK_RIGHT);
|
|
final int queueSize = 200;
|
|
|
|
YMarkEntry bmk;
|
|
// String root = YMarkEntry.FOLDERS_IMPORTED;
|
|
String root = "";
|
|
ByteArrayInputStream stream = null;
|
|
|
|
if(isAdmin || isAuthUser) {
|
|
String bmk_user = (isAuthUser ? user.getUserName() : YMarkTables.USER_ADMIN);
|
|
final ArrayBlockingQueue<String> autoTaggingQueue = new ArrayBlockingQueue<String>(10*queueSize);
|
|
boolean autotag = false;
|
|
boolean merge = false;
|
|
boolean empty = false;
|
|
final String indexing = post.get("indexing", "off");
|
|
final boolean medialink = post.getBoolean("medialink");
|
|
|
|
if(post.containsKey("autotag") && !post.get("autotag", "off").equals("off")) {
|
|
autotag = true;
|
|
if(post.get("autotag").equals("merge")) {
|
|
merge = true;
|
|
}
|
|
if(post.get("autotag").equals("empty")) {
|
|
empty = true;
|
|
}
|
|
YMarkAutoTagger autoTagger = new YMarkAutoTagger(autoTaggingQueue, sb.loader, sb.tables.bookmarks, bmk_user, merge);
|
|
OneTimeBusyThread.startFromRunnable(autoTagger, 0);
|
|
}
|
|
|
|
if(isAdmin && post.containsKey("table") && post.get("table").length() > 0) {
|
|
bmk_user = post.get("table").substring(0, post.get("table").indexOf('_',0));
|
|
}
|
|
if(post.containsKey("redirect") && post.get("redirect").length() > 0) {
|
|
prop.put("redirect_url", post.get("redirect"));
|
|
prop.put("redirect", "1");
|
|
}
|
|
if(post.containsKey("root") && post.get("root").length() > 0) {
|
|
root = post.get("root");
|
|
}
|
|
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
|
|
if(post.containsKey("bmkfile") && !post.get("bmkfile").isEmpty() && post.containsKey("importer")){
|
|
final byte[] bytes = UTF8.getBytes(post.get("bmkfile$file"));
|
|
stream = new ByteArrayInputStream(bytes);
|
|
if(post.get("importer").equals("surro") && stream != null) {
|
|
/**
|
|
SurrogateReader surrogateReader;
|
|
try {
|
|
surrogateReader = new SurrogateReader(stream, queueSize, sb.crawlStacker, sb.index.fulltext().getDefaultConfiguration());
|
|
} catch (final IOException e) {
|
|
//TODO: display an error message
|
|
ConcurrentLog.logException(e);
|
|
prop.put("status", "0");
|
|
return prop;
|
|
}
|
|
InstantBusyThread.oneTimeJob(surrogateReader, 0);
|
|
while ((bmk = new YMarkEntry(surrogateReader.take())) != YMarkEntry.POISON) {
|
|
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
|
|
}
|
|
prop.put("status", "1");
|
|
*/
|
|
} else {
|
|
MonitoredReader reader = new MonitoredReader(new InputStreamReader(stream, StandardCharsets.UTF_8), 1024*16, bytes.length);
|
|
if(post.get("importer").equals("html") && reader != null) {
|
|
final YMarkHTMLImporter htmlImporter = new YMarkHTMLImporter(reader, queueSize, root);
|
|
OneTimeBusyThread.startFromRunnable(htmlImporter, 0);
|
|
OneTimeBusyThread.startFromRunnable(htmlImporter.getConsumer(sb, bmk_user, autoTaggingQueue, autotag, empty, indexing, medialink), 0);
|
|
prop.put("status", "1");
|
|
} else if(post.get("importer").equals("xbel") && reader != null) {
|
|
final YMarkXBELImporter xbelImporter;
|
|
try {
|
|
//TODO: make RootFold
|
|
xbelImporter = new YMarkXBELImporter(reader, queueSize, root);
|
|
} catch (final SAXException e) {
|
|
//TODO: display an error message
|
|
ConcurrentLog.logException(e);
|
|
prop.put("status", "0");
|
|
return prop;
|
|
}
|
|
OneTimeBusyThread.startFromRunnable(xbelImporter, 0);
|
|
OneTimeBusyThread.startFromRunnable(xbelImporter.getConsumer(sb, bmk_user, autoTaggingQueue, autotag, empty, indexing, medialink), 0);
|
|
prop.put("status", "1");
|
|
} else if(post.get("importer").equals("json") && reader != null) {
|
|
YMarkJSONImporter jsonImporter;
|
|
jsonImporter = new YMarkJSONImporter(reader, queueSize, root);
|
|
OneTimeBusyThread.startFromRunnable(jsonImporter, 0);
|
|
while ((bmk = jsonImporter.take()) != YMarkEntry.POISON) {
|
|
putBookmark(sb, bmk_user, bmk, autoTaggingQueue, autotag, empty, indexing, medialink);
|
|
}
|
|
prop.put("status", "1");
|
|
}
|
|
}
|
|
} else if(post.containsKey("importer") && post.get("importer").equals("crawls")) {
|
|
if(!isAdmin) {
|
|
prop.authenticationRequired();
|
|
return prop;
|
|
}
|
|
try {
|
|
final Pattern pattern = Pattern.compile("^crawl start for.*");
|
|
final Iterator<Tables.Row> APIcalls = sb.tables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_COMMENT, pattern);
|
|
Tables.Row row = null;
|
|
while(APIcalls.hasNext()) {
|
|
row = APIcalls.next();
|
|
if(row.get(WorkTables.TABLE_API_COL_TYPE, "").equals("crawler")) {
|
|
final String url = row.get(WorkTables.TABLE_API_COL_COMMENT, "").substring(16);
|
|
sb.tables.bookmarks.createBookmark(sb.loader, url, agent, bmk_user, autotag, "crawlStart", "/Crawl Start");
|
|
}
|
|
}
|
|
prop.put("status", "1");
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
} catch (final Failure e) {
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
} else if(post.containsKey("importer") && post.get("importer").equals("bmks")) {
|
|
if(!isAdmin) {
|
|
prop.authenticationRequired();
|
|
return prop;
|
|
}
|
|
final Iterator<String> bit=sb.bookmarksDB.getBookmarksIterator(isAdmin);
|
|
while(bit.hasNext()){
|
|
Bookmark bookmark=sb.bookmarksDB.getBookmark(bit.next());
|
|
if (bookmark != null) {
|
|
final YMarkEntry bmk_entry = new YMarkEntry(false);
|
|
bmk_entry.put(YMarkEntry.BOOKMARK.URL.key(), bookmark.getUrl());
|
|
try {
|
|
if(!sb.tables.has(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user), YMarkUtil.getBookmarkId(bookmark.getUrl()))) {
|
|
bmk_entry.put(YMarkEntry.BOOKMARK.PUBLIC.key(), bookmark.getPublic() ? "true" : "false");
|
|
bmk_entry.put(YMarkEntry.BOOKMARK.TITLE.key(), bookmark.getTitle());
|
|
bmk_entry.put(YMarkEntry.BOOKMARK.DESC.key(), bookmark.getDescription());
|
|
bmk_entry.put(YMarkEntry.BOOKMARK.TAGS.key(), bookmark.getTagsString());
|
|
bmk_entry.put(YMarkEntry.BOOKMARK.FOLDERS.key(), root+bookmark.getFoldersString().replaceAll(".*"+YMarkUtil.TAGS_SEPARATOR+YMarkUtil.FOLDERS_SEPARATOR, root+YMarkUtil.FOLDERS_SEPARATOR));
|
|
}
|
|
if(autotag) {
|
|
bmk_entry.put(YMarkEntry.BOOKMARK.TAGS.key(), YMarkAutoTagger.autoTag(bookmark.getUrl(), sb.loader, agent, 3, sb.tables.bookmarks.getTags(bmk_user)));
|
|
}
|
|
sb.tables.bookmarks.addBookmark(bmk_user, bmk_entry, merge, true);
|
|
prop.put("status", "1");
|
|
} catch (final MalformedURLException e) {
|
|
ConcurrentLog.logException(e);
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
}
|
|
}
|
|
} else if(post.containsKey("importer") && post.get("importer").equals("dmoz")) {
|
|
if(!isAdmin) {
|
|
prop.authenticationRequired();
|
|
return prop;
|
|
}
|
|
try {
|
|
final File in = new File(sb.workPath, "content.rdf.u8.gz");
|
|
final InputStream gzip = new FileInputStream(in);
|
|
final InputStream content = new GZIPInputStream(gzip);
|
|
final InputStreamReader reader = new InputStreamReader(content, StandardCharsets.UTF_8);
|
|
final BufferedReader breader = new BufferedReader(reader);
|
|
final MonitoredReader mreader = new MonitoredReader(breader, 1024*1024, in.length());
|
|
|
|
final String source = post.get("source", "");
|
|
final YMarkDMOZImporter DMOZImporter = new YMarkDMOZImporter(mreader, queueSize, root, source);
|
|
|
|
mreader.addChangeListener(sb.tables.bookmarks.getProgressListener("DMOZImporter"));
|
|
DMOZImporter.setDepth(6);
|
|
OneTimeBusyThread.startFromRunnable(DMOZImporter, 0);
|
|
OneTimeBusyThread.startFromRunnable(DMOZImporter.getConsumer(sb, bmk_user, autoTaggingQueue, autotag, empty, indexing, medialink), 0);
|
|
|
|
prop.put("status", "1");
|
|
} catch (final Exception e) {
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
}
|
|
} else {
|
|
prop.put(serverObjects.ACTION_AUTHENTICATE, YMarkTables.USER_AUTHENTICATE_MSG);
|
|
}
|
|
// return rewrite properties
|
|
return prop;
|
|
}
|
|
|
|
public static void putBookmark(final Switchboard sb, final String bmk_user, final YMarkEntry bmk,
|
|
final ArrayBlockingQueue<String> autoTaggingQueue, final boolean autotag, final boolean empty, final String indexing, final boolean medialink) {
|
|
try {
|
|
final String url = bmk.get(YMarkEntry.BOOKMARK.URL.key());
|
|
// other protocols could cause problems
|
|
if(url != null && url.startsWith("http")) {
|
|
sb.tables.bookmarks.addBookmark(bmk_user, bmk, true, true);
|
|
if(autotag) {
|
|
if(!empty) {
|
|
autoTaggingQueue.put(url);
|
|
} else if(!bmk.containsKey(YMarkEntry.BOOKMARK.TAGS.key()) || bmk.get(YMarkEntry.BOOKMARK.TAGS.key()).equals(YMarkEntry.BOOKMARK.TAGS.deflt())) {
|
|
autoTaggingQueue.put(url);
|
|
}
|
|
}
|
|
// fill crawler
|
|
if (indexing.equals("single")) {
|
|
bmk.crawl(YMarkCrawlStart.CRAWLSTART.SINGLE, medialink, sb);
|
|
} else if (indexing.equals("onelink")) {
|
|
bmk.crawl(YMarkCrawlStart.CRAWLSTART.ONE_LINK, medialink, sb);
|
|
} else if (indexing.equals("fulldomain")) {
|
|
bmk.crawl(YMarkCrawlStart.CRAWLSTART.FULL_DOMAIN, medialink, sb);
|
|
}
|
|
}
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
} catch (final InterruptedException e) {
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
}
|
|
}
|