- yacy can import phpbb3 posts without crawling - all data is written as surrogate - indexed surrogate files can be re-used git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5985 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
f1a9253baa
commit
4b4bddca00
@ -0,0 +1,83 @@
|
|||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head>
|
||||||
|
<title>YaCy '#[clientname]#': Content Integration: Retrieval from phpBB3 Databases</title>
|
||||||
|
#%env/templates/metas.template%#
|
||||||
|
</head>
|
||||||
|
<body id="ContentIntegrationPHPBB3">
|
||||||
|
#%env/templates/header.template%#
|
||||||
|
#%env/templates/submenuIndexCreate.template%#
|
||||||
|
<h2>Content Integration: Retrieval from phpBB3 Databases</h2>
|
||||||
|
<p>
|
||||||
|
It is possible to extract texts directly from mySQL and postgreSQL databases.
|
||||||
|
Each extraction is specific to the data that is hosted in the database.
|
||||||
|
This interface gives you access to the phpBB3 forums software content.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
When an export is started, surrogate files are generated into DATA/SURROGATES/in, from where they are automatically fetched by an indexer thread.
|
||||||
|
All indexed surrogate files are then moved to DATA/SURROGATES/out and can be recycled when an index is deleted.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<form action="ContentIntegrationPHPBB3_p.html">
|
||||||
|
<fieldset>
|
||||||
|
<dl>
|
||||||
|
<dt><b>The URL stub</b>,<br />like http://forum.yacy-websuche.de<br />this must be the path right in front of '/viewtopic.php?'</dt>
|
||||||
|
<dd><input type="text" name="content.phpbb3.urlstub" value="#[content.phpbb3.urlstub]#" size="60" /></dd>
|
||||||
|
|
||||||
|
<dt><b>Type</b> of database<br />(use either 'mysql' or 'pgsql')</dt>
|
||||||
|
<dd><input type="text" name="content.phpbb3.dbtype" value="#[content.phpbb3.dbtype]#" size="6" /></dd>
|
||||||
|
|
||||||
|
<dt><b>Host</b> of the database</dt>
|
||||||
|
<dd><input type="text" name="content.phpbb3.dbhost" value="#[content.phpbb3.dbhost]#" size="40" /></dd>
|
||||||
|
|
||||||
|
<dt><b>Port</b> of database service<br />(usually 3306 for mySQL)</dt>
|
||||||
|
<dd><input type="text" name="content.phpbb3.dbport" value="#[content.phpbb3.dbport]#" size="6" /></dd>
|
||||||
|
|
||||||
|
<dt><b>Name of the database</b> on the host</dt>
|
||||||
|
<dd><input type="text" name="content.phpbb3.dbname" value="#[content.phpbb3.dbname]#" size="20" /></dd>
|
||||||
|
|
||||||
|
<dt><b>User</b> that can access the database</dt>
|
||||||
|
<dd><input type="text" name="content.phpbb3.dbuser" value="#[content.phpbb3.dbuser]#" size="20" /></dd>
|
||||||
|
|
||||||
|
<dt><b>Password</b> for the account of that user given above</dt>
|
||||||
|
<dd><input type="text" name="content.phpbb3.dbpw" value="#[content.phpbb3.dbpw]#" size="20" /></dd>
|
||||||
|
|
||||||
|
<dt><b>Posts per file</b><br />in exported surrogates</dt>
|
||||||
|
<dd><input type="text" name="content.phpbb3.ppf" value="#[content.phpbb3.ppf]#" size="20" /></dd>
|
||||||
|
|
||||||
|
<dt></dt>
|
||||||
|
<dd>
|
||||||
|
<input type="submit" name="check" value="Check database connection" />
|
||||||
|
<input type="submit" name="export" value="Export Content to Surrogates" />
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
</fieldset>
|
||||||
|
</form>
|
||||||
|
|
||||||
|
#(check)#::
|
||||||
|
<form>
|
||||||
|
<fieldset>
|
||||||
|
<dl>
|
||||||
|
<dt>Posts in database</dt>
|
||||||
|
<dd>#[posts]#</dd>
|
||||||
|
|
||||||
|
<dt>first entry</dt>
|
||||||
|
<dd>#[first]#</dd>
|
||||||
|
|
||||||
|
<dt>last entry</dt>
|
||||||
|
<dd>#[last]#</dd>
|
||||||
|
|
||||||
|
</dl>
|
||||||
|
</fieldset>
|
||||||
|
</form>::
|
||||||
|
<p>Info failed: #[error]#</p>
|
||||||
|
#(/check)#
|
||||||
|
|
||||||
|
#(export)#::
|
||||||
|
<p>Export successful! Wrote #[files]# files in DATA/SURROGATES/in</p>::
|
||||||
|
<p>Export failed: #[error]#</p>
|
||||||
|
#(/export)#
|
||||||
|
|
||||||
|
#%env/templates/footer.template%#
|
||||||
|
</body>
|
||||||
|
</html>
|
@ -0,0 +1,123 @@
|
|||||||
|
// ContentIntegrationPHPBB3_p.java
|
||||||
|
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||||
|
// first published 27.05.2009 on http://yacy.net
|
||||||
|
//
|
||||||
|
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||||
|
// $LastChangedRevision: 1986 $
|
||||||
|
// $LastChangedBy: orbiter $
|
||||||
|
//
|
||||||
|
// LICENSE
|
||||||
|
//
|
||||||
|
// This program is free software; you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation; either version 2 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program; if not, write to the Free Software
|
||||||
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
|
||||||
|
import de.anomic.content.dao.Dao;
|
||||||
|
import de.anomic.content.dao.PhpBB3Dao;
|
||||||
|
import de.anomic.http.httpRequestHeader;
|
||||||
|
import de.anomic.kelondro.util.DateFormatter;
|
||||||
|
import de.anomic.plasma.plasmaSwitchboard;
|
||||||
|
import de.anomic.server.serverObjects;
|
||||||
|
import de.anomic.server.serverSwitch;
|
||||||
|
|
||||||
|
public class ContentIntegrationPHPBB3_p {
|
||||||
|
|
||||||
|
public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch<?> env) {
|
||||||
|
final serverObjects prop = new serverObjects();
|
||||||
|
final plasmaSwitchboard sb = (plasmaSwitchboard) env;
|
||||||
|
|
||||||
|
prop.put("check", 0);
|
||||||
|
prop.put("export", 0);
|
||||||
|
|
||||||
|
if (post != null) {
|
||||||
|
|
||||||
|
String urlstub = post.get("content.phpbb3.urlstub", "");
|
||||||
|
String dbtype = post.get("content.phpbb3.dbtype", "");
|
||||||
|
String dbhost = post.get("content.phpbb3.dbhost", "");
|
||||||
|
int dbport = post.getInt("content.phpbb3.dbport", 3306);
|
||||||
|
String dbname = post.get("content.phpbb3.dbname", "");
|
||||||
|
String dbuser = post.get("content.phpbb3.dbuser", "");
|
||||||
|
String dbpw = post.get("content.phpbb3.dbpw", "");
|
||||||
|
int ppf = post.getInt("content.phpbb3.ppf", 1000);
|
||||||
|
|
||||||
|
|
||||||
|
sb.setConfig("content.phpbb3.urlstub", urlstub);
|
||||||
|
sb.setConfig("content.phpbb3.dbtype", dbtype);
|
||||||
|
sb.setConfig("content.phpbb3.dbhost", dbhost);
|
||||||
|
sb.setConfig("content.phpbb3.dbport", dbport);
|
||||||
|
sb.setConfig("content.phpbb3.dbname", dbname);
|
||||||
|
sb.setConfig("content.phpbb3.dbuser", dbuser);
|
||||||
|
sb.setConfig("content.phpbb3.dbpw", dbpw);
|
||||||
|
sb.setConfig("content.phpbb3.ppf", ppf);
|
||||||
|
|
||||||
|
if (post.containsKey("check")) {
|
||||||
|
try {
|
||||||
|
Dao db = new PhpBB3Dao(
|
||||||
|
urlstub,
|
||||||
|
dbtype,
|
||||||
|
dbhost,
|
||||||
|
dbport,
|
||||||
|
dbname,
|
||||||
|
dbuser,
|
||||||
|
dbpw
|
||||||
|
);
|
||||||
|
prop.put("check", 1);
|
||||||
|
prop.put("check_posts", db.size());
|
||||||
|
prop.putHTML("check_first", db.first().toString());
|
||||||
|
prop.putHTML("check_last", db.latest().toString());
|
||||||
|
db.close();
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
prop.put("check", 2);
|
||||||
|
prop.put("check_error", e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if (post.containsKey("export")) {
|
||||||
|
try {
|
||||||
|
Dao db = new PhpBB3Dao(
|
||||||
|
urlstub,
|
||||||
|
dbtype,
|
||||||
|
dbhost,
|
||||||
|
dbport,
|
||||||
|
dbname,
|
||||||
|
dbuser,
|
||||||
|
dbpw
|
||||||
|
);
|
||||||
|
|
||||||
|
int files = db.writeSurrogates(db.query(0, -1, 100), sb.surrogatesInPath, "fullexport-" + DateFormatter.formatShortSecond(), ppf);
|
||||||
|
prop.put("export", 1);
|
||||||
|
prop.put("export_files", files);
|
||||||
|
db.close();
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
prop.put("export", 2);
|
||||||
|
prop.put("export_error", e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
prop.putHTML("content.phpbb3.urlstub", sb.getConfig("content.phpbb3.urlstub", ""));
|
||||||
|
prop.putHTML("content.phpbb3.dbtype", sb.getConfig("content.phpbb3.dbtype", ""));
|
||||||
|
prop.putHTML("content.phpbb3.dbhost", sb.getConfig("content.phpbb3.dbhost", ""));
|
||||||
|
prop.putHTML("content.phpbb3.dbport", sb.getConfig("content.phpbb3.dbport", ""));
|
||||||
|
prop.putHTML("content.phpbb3.dbname", sb.getConfig("content.phpbb3.dbname", ""));
|
||||||
|
prop.putHTML("content.phpbb3.dbuser", sb.getConfig("content.phpbb3.dbuser", ""));
|
||||||
|
prop.putHTML("content.phpbb3.dbpw", sb.getConfig("content.phpbb3.dbpw", ""));
|
||||||
|
prop.putHTML("content.phpbb3.ppf", sb.getConfig("content.phpbb3.ppf", ""));
|
||||||
|
|
||||||
|
return prop;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,393 @@
|
|||||||
|
// PhpBB3Dao.java
|
||||||
|
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||||
|
// first published 26.05.2009 on http://yacy.net
|
||||||
|
//
|
||||||
|
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
|
||||||
|
// $LastChangedRevision: 1986 $
|
||||||
|
// $LastChangedBy: orbiter $
|
||||||
|
//
|
||||||
|
// LICENSE
|
||||||
|
//
|
||||||
|
// This program is free software; you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU General Public License as published by
|
||||||
|
// the Free Software Foundation; either version 2 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU General Public License
|
||||||
|
// along with this program; if not, write to the Free Software
|
||||||
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
|
||||||
|
package de.anomic.content.dao;
|
||||||
|
|
||||||
|
import java.io.BufferedOutputStream;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.sql.Connection;
|
||||||
|
import java.sql.DriverManager;
|
||||||
|
import java.sql.ResultSet;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.sql.Statement;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.concurrent.ArrayBlockingQueue;
|
||||||
|
import java.util.concurrent.BlockingQueue;
|
||||||
|
|
||||||
|
import de.anomic.content.DCEntry;
|
||||||
|
import de.anomic.yacy.yacyURL;
|
||||||
|
|
||||||
|
public class PhpBB3Dao implements Dao {
|
||||||
|
|
||||||
|
private Connection conn = null;
|
||||||
|
private String urlstub;
|
||||||
|
private HashMap<Integer, String> users;
|
||||||
|
|
||||||
|
/**
 * Creates a dao for one phpBB3 database: remembers the URL stub, starts
 * with an empty user-name cache and opens the database connection.
 *
 * @param urlstub the prefix that is placed in front of "/viewtopic.php?" when post URLs are built
 * @param dbType  database type; handed to getConnection
 * @param host    database host
 * @param port    database port
 * @param dbname  name of the database
 * @param user    database account
 * @param pw      password of that account
 * @throws Exception if getConnection fails
 */
public PhpBB3Dao(
        String urlstub,
        String dbType,
        String host,
        int port,
        String dbname,
        String user,
        String pw) throws Exception {
    this.urlstub = urlstub;
    this.users = new HashMap<Integer, String>();
    this.conn = getConnection(dbType, host, port, dbname, user, pw);
}
|
||||||
|
|
||||||
|
protected void finalize() throws Throwable {
|
||||||
|
closeConnection();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Connection getConnection(final String dbType, String host, int port, String dbname, String user, String pw) throws Exception {
|
||||||
|
String dbDriverStr = null, dbConnStr = null;
|
||||||
|
if (dbType.equalsIgnoreCase("mysql")) {
|
||||||
|
dbDriverStr = "com.mysql.jdbc.Driver";
|
||||||
|
dbConnStr = "jdbc:mysql://" + host + ":" + port + "/" + dbname;
|
||||||
|
} else if (dbType.equalsIgnoreCase("pgsql")) {
|
||||||
|
dbDriverStr = "org.postgresql.Driver";
|
||||||
|
dbConnStr = "jdbc:postgresql://" + host + ":" + port + "/" + dbname;
|
||||||
|
} else throw new IllegalArgumentException();
|
||||||
|
|
||||||
|
try {
|
||||||
|
Class.forName(dbDriverStr).newInstance();
|
||||||
|
} catch (final Exception e) {
|
||||||
|
throw new Exception("Unable to load the jdbc driver: " + e.getMessage(),e);
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
return DriverManager.getConnection(dbConnStr, user, pw);
|
||||||
|
} catch (final Exception e) {
|
||||||
|
throw new Exception("Unable to establish a database connection: " + e.getMessage(),e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void closeConnection() {
|
||||||
|
if (conn != null) {
|
||||||
|
try {
|
||||||
|
conn.close();
|
||||||
|
conn = null;
|
||||||
|
} catch (SQLException e) {
|
||||||
|
System.out.println("PhpBB3Dao: " + e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Date first() {
|
||||||
|
StringBuilder sql = new StringBuilder(256);
|
||||||
|
sql.append("select min(post_time) from phpbb_posts");
|
||||||
|
Statement stmt = null;
|
||||||
|
ResultSet rs = null;
|
||||||
|
try {
|
||||||
|
stmt = conn.createStatement();
|
||||||
|
rs = stmt.executeQuery(sql.toString());
|
||||||
|
if (rs.next()) {
|
||||||
|
return new Date(rs.getLong(1) * 1000L);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
} catch (SQLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
return null;
|
||||||
|
} finally {
|
||||||
|
if (rs != null) try {rs.close();} catch (SQLException e) {}
|
||||||
|
if (stmt != null) try {stmt.close();} catch (SQLException e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Date latest() {
|
||||||
|
StringBuilder sql = new StringBuilder(256);
|
||||||
|
sql.append("select max(post_time) from phpbb_posts");
|
||||||
|
Statement stmt = null;
|
||||||
|
ResultSet rs = null;
|
||||||
|
try {
|
||||||
|
stmt = conn.createStatement();
|
||||||
|
rs = stmt.executeQuery(sql.toString());
|
||||||
|
if (rs.next()) {
|
||||||
|
return new Date(rs.getLong(1) * 1000L);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
} catch (SQLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
return null;
|
||||||
|
} finally {
|
||||||
|
if (rs != null) try {rs.close();} catch (SQLException e) {}
|
||||||
|
if (stmt != null) try {stmt.close();} catch (SQLException e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public int size() {
|
||||||
|
StringBuilder sql = new StringBuilder(256);
|
||||||
|
sql.append("select count(*) from phpbb_posts");
|
||||||
|
Statement stmt = null;
|
||||||
|
ResultSet rs = null;
|
||||||
|
try {
|
||||||
|
stmt = conn.createStatement();
|
||||||
|
rs = stmt.executeQuery(sql.toString());
|
||||||
|
if (rs.next()) {
|
||||||
|
return rs.getInt(1);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
} catch (SQLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
return 0;
|
||||||
|
} finally {
|
||||||
|
if (rs != null) try {rs.close();} catch (SQLException e) {}
|
||||||
|
if (stmt != null) try {stmt.close();} catch (SQLException e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public DCEntry get(int item) {
|
||||||
|
StringBuilder sql = new StringBuilder(256);
|
||||||
|
sql.append("select * from phpbb_posts where post_id = ");
|
||||||
|
sql.append(item);
|
||||||
|
return getOne(sql);
|
||||||
|
}
|
||||||
|
|
||||||
|
public BlockingQueue<DCEntry> query(int from, int until, int queueSize) {
|
||||||
|
// define the sql query
|
||||||
|
final StringBuilder sql = new StringBuilder(256);
|
||||||
|
sql.append("select * from phpbb_posts where post_id >= ");
|
||||||
|
sql.append(from);
|
||||||
|
if (until > from) {
|
||||||
|
sql.append(" and post_id < ");
|
||||||
|
sql.append(until);
|
||||||
|
}
|
||||||
|
sql.append(" order by post_id");
|
||||||
|
|
||||||
|
// execute the query and push entries to a queue concurrently
|
||||||
|
return toQueue(sql, queueSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
public BlockingQueue<DCEntry> query(Date from, int queueSize) {
|
||||||
|
// define the sql query
|
||||||
|
final StringBuilder sql = new StringBuilder(256);
|
||||||
|
sql.append("select * from phpbb_posts where post_time >= ");
|
||||||
|
sql.append(from.getTime() / 1000);
|
||||||
|
sql.append(" order by post_id");
|
||||||
|
|
||||||
|
// execute the query and push entries to a queue concurrently
|
||||||
|
return toQueue(sql, queueSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private DCEntry getOne(StringBuilder sql) {
|
||||||
|
Statement stmt = null;
|
||||||
|
ResultSet rs = null;
|
||||||
|
try {
|
||||||
|
stmt = conn.createStatement();
|
||||||
|
rs = stmt.executeQuery(sql.toString());
|
||||||
|
if (rs.next()) {
|
||||||
|
try {
|
||||||
|
return parseResultSet(rs);
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
} catch (SQLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
return null;
|
||||||
|
} finally {
|
||||||
|
if (rs != null) try {rs.close();} catch (SQLException e) {}
|
||||||
|
if (stmt != null) try {stmt.close();} catch (SQLException e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private BlockingQueue<DCEntry> toQueue(final StringBuilder sql, int queueSize) {
|
||||||
|
// execute the query and push entries to a queue concurrently
|
||||||
|
final BlockingQueue<DCEntry> queue = new ArrayBlockingQueue<DCEntry>(queueSize);
|
||||||
|
Thread dbreader = new Thread() {
|
||||||
|
public void run() {
|
||||||
|
Statement stmt = null;
|
||||||
|
ResultSet rs = null;
|
||||||
|
try {
|
||||||
|
stmt = conn.createStatement();
|
||||||
|
rs = stmt.executeQuery(sql.toString());
|
||||||
|
while (rs.next()) {
|
||||||
|
try {
|
||||||
|
queue.put(parseResultSet(rs));
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
queue.put(DCEntry.poison);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (SQLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} finally {
|
||||||
|
if (rs != null) try {rs.close();} catch (SQLException e) {}
|
||||||
|
if (stmt != null) try {stmt.close();} catch (SQLException e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
dbreader.start();
|
||||||
|
return queue;
|
||||||
|
}
|
||||||
|
|
||||||
|
private DCEntry parseResultSet(ResultSet rs) throws SQLException, MalformedURLException {
|
||||||
|
yacyURL url;
|
||||||
|
int item = rs.getInt("post_id");
|
||||||
|
url = new yacyURL(this.urlstub + "/viewtopic.php?t=" + item);
|
||||||
|
String subject = rs.getString("post_subject");
|
||||||
|
String text = xmlCleaner(rs.getString("post_text"));
|
||||||
|
String user = getUser(rs.getInt("poster_id"));
|
||||||
|
Date date = new Date(rs.getLong("post_time") * 1000L);
|
||||||
|
return new DCEntry(url, date, subject, user, text);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String xmlCleaner(String s) {
|
||||||
|
if (s == null) return null;
|
||||||
|
|
||||||
|
StringBuilder sbOutput = new StringBuilder(s.length());
|
||||||
|
char c;
|
||||||
|
|
||||||
|
for (int i = 0; i < s.length(); i++ ) {
|
||||||
|
c = s.charAt(i);
|
||||||
|
if ((c >= 0x0020 && c <= 0xD7FF) ||
|
||||||
|
(c >= 0xE000 && c <= 0xFFFD) ||
|
||||||
|
c == 0x0009 ||
|
||||||
|
c == 0x000A ||
|
||||||
|
c == 0x000D ) {
|
||||||
|
sbOutput.append(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sbOutput.toString().trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getUser(int poster_id) {
|
||||||
|
String nick = this.users.get(poster_id);
|
||||||
|
if (nick != null) return nick;
|
||||||
|
|
||||||
|
StringBuilder sql = new StringBuilder(256);
|
||||||
|
sql.append("select * from phpbb_users where user_id = ");
|
||||||
|
sql.append(poster_id);
|
||||||
|
Statement stmt = null;
|
||||||
|
ResultSet rs = null;
|
||||||
|
try {
|
||||||
|
stmt = conn.createStatement();
|
||||||
|
rs = stmt.executeQuery(sql.toString());
|
||||||
|
if (rs.next()) nick = rs.getString("username");
|
||||||
|
if (nick == null) nick = "";
|
||||||
|
this.users.put(poster_id, nick);
|
||||||
|
return nick;
|
||||||
|
} catch (SQLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
return "";
|
||||||
|
} finally {
|
||||||
|
if (rs != null) try {rs.close();} catch (SQLException e) {}
|
||||||
|
if (stmt != null) try {stmt.close();} catch (SQLException e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public int writeSurrogates(
|
||||||
|
BlockingQueue<DCEntry> queue,
|
||||||
|
File targetdir,
|
||||||
|
String versioninfo,
|
||||||
|
int maxEntriesInFile
|
||||||
|
) {
|
||||||
|
try {
|
||||||
|
// generate output file name and attributes
|
||||||
|
String targethost = new yacyURL(this.urlstub, null).getHost();
|
||||||
|
int fc = 0;
|
||||||
|
File outputfiletmp = null, outputfile = null;
|
||||||
|
|
||||||
|
// write the result from the query concurrently in a file
|
||||||
|
OutputStreamWriter osw = null;
|
||||||
|
DCEntry e;
|
||||||
|
int c = 0;
|
||||||
|
while ((e = queue.take()) != DCEntry.poison) {
|
||||||
|
if (osw == null) {
|
||||||
|
outputfiletmp = new File(targetdir, targethost + "." + versioninfo + "." + fc + ".xml.tmp");
|
||||||
|
outputfile = new File(targetdir, targethost + "." + versioninfo + "." + fc + ".xml");
|
||||||
|
if (outputfiletmp.exists()) outputfiletmp.delete();
|
||||||
|
if (outputfile.exists()) outputfile.delete();
|
||||||
|
osw = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(outputfiletmp)), "UTF-8");
|
||||||
|
osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<surrogates xmlns:dc=\"http://purl.org/dc/elements/1.1/\">\n");
|
||||||
|
}
|
||||||
|
e.writeXML(osw);
|
||||||
|
c++;
|
||||||
|
if (c >= maxEntriesInFile) {
|
||||||
|
osw.write("</surrogates>\n");
|
||||||
|
osw.close();
|
||||||
|
outputfiletmp.renameTo(outputfile);
|
||||||
|
osw = null;
|
||||||
|
c = 0;
|
||||||
|
fc++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
osw.write("</surrogates>\n");
|
||||||
|
osw.close();
|
||||||
|
outputfiletmp.renameTo(outputfile);
|
||||||
|
return fc + 1;
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (UnsupportedEncodingException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() {
|
||||||
|
try {
|
||||||
|
this.conn.close();
|
||||||
|
} catch (SQLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
PhpBB3Dao db;
|
||||||
|
try {
|
||||||
|
db = new PhpBB3Dao(
|
||||||
|
"http://forum.yacy-websuche.de",
|
||||||
|
"mysql",
|
||||||
|
"localhost",
|
||||||
|
3306,
|
||||||
|
"forum",
|
||||||
|
"root",
|
||||||
|
""
|
||||||
|
);
|
||||||
|
System.out.println("Posts in database : " + db.size());
|
||||||
|
System.out.println("First entry : " + db.first());
|
||||||
|
System.out.println("Last entry : " + db.latest());
|
||||||
|
File targetdir = new File("x").getParentFile();
|
||||||
|
db.writeSurrogates(db.query(0, -1, 100), targetdir, "id0-current", 3000);
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in new issue