|
|
|
/**
|
|
|
|
* snapshot
|
|
|
|
* Copyright 2014 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
|
|
|
|
* First released 02.12.2014 at http://yacy.net
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
|
|
* along with this program in the file lgpl21.txt
|
|
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
|
|
|
import java.awt.Container;
|
|
|
|
import java.awt.Image;
|
|
|
|
import java.awt.MediaTracker;
|
|
|
|
import java.io.ByteArrayInputStream;
|
|
|
|
import java.io.File;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.net.MalformedURLException;
|
|
|
|
import java.util.Collection;
|
|
|
|
import java.util.Date;
|
|
|
|
import java.util.Map;
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
import java.util.TreeMap;
|
|
|
|
|
|
|
|
import org.apache.solr.common.SolrDocument;
|
|
|
|
import org.apache.solr.common.SolrInputDocument;
|
|
|
|
|
|
|
|
import net.yacy.cora.document.encoding.ASCII;
|
|
|
|
import net.yacy.cora.document.encoding.UTF8;
|
|
|
|
import net.yacy.cora.document.feed.RSSFeed;
|
|
|
|
import net.yacy.cora.document.feed.RSSMessage;
|
|
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
|
|
import net.yacy.cora.protocol.RequestHeader;
|
|
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
|
|
import net.yacy.cora.util.Html2Image;
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
import net.yacy.cora.util.JSONException;
|
|
|
|
import net.yacy.cora.util.JSONObject;
|
|
|
|
import net.yacy.crawler.data.Snapshots;
|
|
|
|
import net.yacy.crawler.data.Transactions;
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
import net.yacy.crawler.data.Snapshots.Revisions;
|
|
|
|
import net.yacy.document.ImageParser;
|
|
|
|
import net.yacy.kelondro.util.FileUtils;
|
|
|
|
import net.yacy.peers.graphics.EncodedImage;
|
|
|
|
import net.yacy.search.Switchboard;
|
|
|
|
import net.yacy.search.SwitchboardConstants;
|
|
|
|
import net.yacy.server.serverObjects;
|
|
|
|
import net.yacy.server.serverSwitch;
|
|
|
|
|
|
|
|
public class snapshot {
|
|
|
|
|
|
|
|
//width = 1024, height = 1024, density = 300, quality = 75
|
|
|
|
private final static int DEFAULT_WIDTH = 1024;
|
|
|
|
private final static int DEFAULT_HEIGHT = 1024;
|
|
|
|
private final static int DEFAULT_DENSITY = 300;
|
|
|
|
private final static int DEFAULT_QUALITY = 75;
|
|
|
|
private final static String DEFAULT_EXT = "jpg";
|
|
|
|
|
|
|
|
public static Object respond(final RequestHeader header, serverObjects post, final serverSwitch env) {
|
|
|
|
final Switchboard sb = (Switchboard) env;
|
|
|
|
|
|
|
|
final boolean authenticated = sb.adminAuthenticated(header) >= 2;
|
|
|
|
final String ext = header.get("EXT", "");
|
|
|
|
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
|
|
|
|
if (ext.equals("rss")) {
|
|
|
|
// create a report about the content of the snapshot directory
|
|
|
|
if (!authenticated) return null;
|
|
|
|
int maxcount = post == null ? 10 : post.getInt("maxcount", 10);
|
|
|
|
int depthx = post == null ? -1 : post.getInt("depth", -1);
|
|
|
|
Integer depth = depthx == -1 ? null : depthx;
|
|
|
|
String orderx = post == null ? "ANY" : post.get("order", "ANY");
|
|
|
|
Snapshots.Order order = Snapshots.Order.valueOf(orderx);
|
|
|
|
String statex = post == null ? Transactions.State.INVENTORY.name() : post.get("state", Transactions.State.INVENTORY.name());
|
|
|
|
Transactions.State state = Transactions.State.valueOf(statex);
|
|
|
|
String host = post == null ? null : post.get("host");
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
Map<String, Revisions> iddate = Transactions.select(host, depth, order, maxcount, state);
|
|
|
|
// now select the URL from the index for these ids in iddate and make an RSS feed
|
|
|
|
RSSFeed rssfeed = new RSSFeed(Integer.MAX_VALUE);
|
|
|
|
rssfeed.setChannel(new RSSMessage("Snapshot list for host = " + host + ", depth = " + depth + ", order = " + order + ", maxcount = " + maxcount, "", ""));
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
for (Map.Entry<String, Revisions> e: iddate.entrySet()) {
|
|
|
|
try {
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
DigestURL u = e.getValue().url == null ? sb.index.fulltext().getURL(e.getKey()) : new DigestURL(e.getValue().url);
|
|
|
|
if (u == null) continue;
|
|
|
|
RSSMessage message = new RSSMessage(u.toNormalform(true), "", u, e.getKey());
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
message.setPubDate(e.getValue().dates[0]);
|
|
|
|
rssfeed.addMessage(message);
|
|
|
|
} catch (IOException ee) {
|
|
|
|
ConcurrentLog.logException(ee);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
byte[] rssBinary = UTF8.getBytes(rssfeed.toString());
|
|
|
|
return new ByteArrayInputStream(rssBinary);
|
|
|
|
}
|
|
|
|
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
// for the following methods we (mostly) need an url or a url hash
|
|
|
|
if (post == null) post = new serverObjects();
|
|
|
|
final boolean xml = ext.equals("xml");
|
|
|
|
final boolean pdf = ext.equals("pdf");
|
|
|
|
if (pdf && !authenticated) return null;
|
|
|
|
final boolean pngjpg = ext.equals("png") || ext.equals("jpg");
|
|
|
|
String urlhash = post.get("urlhash", "");
|
|
|
|
String url = post.get("url", "");
|
|
|
|
DigestURL durl = null;
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
if (urlhash.length() == 0 && url.length() > 0) {
|
|
|
|
try {
|
|
|
|
durl = new DigestURL(url);
|
|
|
|
urlhash = ASCII.String(durl.hash());
|
|
|
|
} catch (MalformedURLException e) {
|
|
|
|
}
|
|
|
|
}
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
if (durl == null && urlhash.length() > 0) {
|
|
|
|
try {
|
|
|
|
durl = sb.index.fulltext().getURL(urlhash);
|
|
|
|
} catch (IOException e) {
|
|
|
|
ConcurrentLog.logException(e);
|
|
|
|
}
|
|
|
|
}
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
if (url.length() == 0 && durl != null) url = durl.toNormalform(true);
|
|
|
|
|
|
|
|
if (ext.equals("json")) {
|
|
|
|
// command interface: view and change a transaction state, get metadata about transactions in the past
|
|
|
|
String command = post.get("command", "metadata");
|
|
|
|
String statename = post.get("state");
|
|
|
|
JSONObject result = new JSONObject();
|
|
|
|
try {
|
|
|
|
if (command.equals("status")) {
|
|
|
|
// return a status of the transaction archive
|
|
|
|
JSONObject sizes = new JSONObject();
|
|
|
|
for (Map.Entry<String, Integer> state: Transactions.sizes().entrySet()) sizes.put(state.getKey(), state.getValue());
|
|
|
|
result.put("size", sizes);
|
|
|
|
} else if (command.equals("list")) {
|
|
|
|
if (!authenticated) return null;
|
|
|
|
// return a status of the transaction archive
|
|
|
|
String host = post.get("host");
|
|
|
|
String depth = post.get("depth");
|
|
|
|
int depthi = depth == null ? -1 : Integer.parseInt(depth);
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
for (Transactions.State state: statename == null ?
|
|
|
|
new Transactions.State[]{Transactions.State.INVENTORY, Transactions.State.ARCHIVE} :
|
|
|
|
new Transactions.State[]{Transactions.State.valueOf(statename)}) {
|
|
|
|
if (host == null) {
|
|
|
|
JSONObject hostCountInventory = new JSONObject();
|
|
|
|
for (String h: Transactions.listHosts(state)) {
|
|
|
|
int size = Transactions.listIDsSize(h, depthi, state);
|
|
|
|
if (size > 0) hostCountInventory.put(h, size);
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
}
|
|
|
|
result.put("count." + state.name(), hostCountInventory);
|
|
|
|
} else {
|
|
|
|
TreeMap<Integer, Collection<Revisions>> ids = Transactions.listIDs(host, depthi, state);
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
if (ids == null) {
|
|
|
|
result.put("result", "fail");
|
|
|
|
result.put("comment", "no entries for host " + host + " found");
|
Added a transaction interface to the snapshots: all documents in the
snapshots can now be processed with transactions using commit and
rollback commands. Furthermore, a large number of monitoring methods had
been added to check the success of transactions.
The transactions for snapshots have two main components: a rss search
API to get information about latest/oldest entries and a commit/rollback
API to move entries away from the rss results. This is done by usage of
two storage locations for the snapshots, INVENTORY and ARCHIVE. New
snapshots are placed to INVENTORY, commited snapshots move to ARCHIVE,
rollback snapshots move to INVENTORY again.
Normal Workflow:
Beside all these options below, usually it is sufficient to process data
like this:
- call
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
- process the rss result and use the <guid> value as <urlhash> (see next
command)
- for each processed result call
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
- then you can call the rss feed again and the commited urls are omited
from the next set of items.
These are the commands to control this:
The rss feed:
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=OLDESTFIRST
http://localhost:8090/api/snapshot.rss?state=ARCHIVE&order=LATESTFIRST
The feed will return a <urlhash> in the <guid> - field of the rss. This
must be used for commit/rollback:
Commit/Rollback:
http://localhost:8090/api/snapshot.json?command=commit&urlhash=<urlhash>
http://localhost:8090/api/snapshot.json?command=rollback&urlhash=<urlhash>
The json will return a property list containing the property "result"
with possible values "success" or "fail", according of the result. If an
"fail" occurs, please look into the log for further info.
Monitoring:
http://localhost:8090/api/snapshot.json?command=status
This shows the total number of entries in the INVENTORY and the ARCHIVE
http://localhost:8090/api/snapshot.json?command=list
This will result a list of all hosts which have snapshots and the number
of entries for the hosts. Counts for INVENTORY and ARCHIVE are listed in
the porperties for "count.INVENTORY" and "count.ARCHIVE"
http://localhost:8090/api/snapshot.json?command=list&depth=2
The list can be restricted to such which have a specific depth. The list
contains then the same host names, but the count values change because
only documents at that specific crawl depth are listed
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80
This lists all urlhashes for the given host, not only an accumulated
list of the number of entries
http://localhost:8090/api/snapshot.json?command=list&host=yacy.net.80&depth=0
This restricts the list of urlhashes for that host for the given depth
http://localhost:8090/api/snapshot.json?command=list&state=INVENTORY
http://localhost:8090/api/snapshot.json?command=list&state=ARCHIVE
This selects either the INVENTORY or ARCHIVE for all list commands,
default is ALL which means that from both snapshot directories the host
information is collected and combined. You can use the state option for
all the commands as listed above
Detailed Information:
http://localhost:8090/api/snapshot.json?command=metadata&urlhash=upiFJ7Fh1hyQ
This collects metadata information for the given urlhash. This can also
be restricted with state=INVENTORY and state=ARCHIVE to test if the
document is either in one of these snapshot directories. If an urlhash
is not found, an empty result is returned. If an entry was found and the
state was not restricted, then the result contains a state property
containing the name of the location where the document is, either
INVENTORY or ARCHIVE.
Hint:
If a very large number of documents is inside of INVENTORY, then it
could be better to call the rss feed with
http://localhost:8090/api/snapshot.rss?state=INVENTORY&order=ANY
because that is very efficient.
10 years ago
|
|
|
} else {
|
|
|
|
for (Map.Entry<Integer, Collection<Revisions>> entry: ids.entrySet()) {
|
|
|
|
for (Revisions r: entry.getValue()) {
|
|
|
|
try {
|
|
|
|
JSONObject metadata = new JSONObject();
|
|
|
|
DigestURL u = r.url != null ? new DigestURL(r.url) : sb.index.fulltext().getURL(r.urlhash);
|
|
|
|
metadata.put("url", u == null ? "unknown" : u.toNormalform(true));
|
|
|
|
metadata.put("dates", r.dates);
|
|
|
|
assert r.depth == entry.getKey().intValue();
|
|
|
|
metadata.put("depth", entry.getKey().intValue());
|
|
|
|
result.put(r.urlhash, metadata);
|
|
|
|
} catch (IOException e) {}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (command.equals("commit")) {
|
|
|
|
if (!authenticated) return null;
|
|
|
|
Revisions r = Transactions.commit(urlhash);
|
|
|
|
if (r != null) {
|
|
|
|
result.put("result", "success");
|
|
|
|
result.put("depth", r.depth);
|
|
|
|
result.put("url", r.url);
|
|
|
|
result.put("dates", r.dates);
|
|
|
|
} else {
|
|
|
|
result.put("result", "fail");
|
|
|
|
}
|
|
|
|
result.put("urlhash", urlhash);
|
|
|
|
} else if (command.equals("rollback")) {
|
|
|
|
if (!authenticated) return null;
|
|
|
|
Revisions r = Transactions.rollback(urlhash);
|
|
|
|
if (r != null) {
|
|
|
|
result.put("result", "success");
|
|
|
|
result.put("depth", r.depth);
|
|
|
|
result.put("url", r.url);
|
|
|
|
result.put("dates", r.dates);
|
|
|
|
} else {
|
|
|
|
result.put("result", "fail");
|
|
|
|
}
|
|
|
|
result.put("urlhash", urlhash);
|
|
|
|
} else if (command.equals("metadata")) {
|
|
|
|
try {
|
|
|
|
Revisions r;
|
|
|
|
Transactions.State state = statename == null || statename.length() == 0 ? null : Transactions.State.valueOf(statename);
|
|
|
|
if (state == null) {
|
|
|
|
r = Transactions.getRevisions(Transactions.State.INVENTORY, urlhash);
|
|
|
|
if (r != null) state = Transactions.State.INVENTORY;
|
|
|
|
r = Transactions.getRevisions(Transactions.State.ARCHIVE, urlhash);
|
|
|
|
if (r != null) state = Transactions.State.ARCHIVE;
|
|
|
|
} else {
|
|
|
|
r = Transactions.getRevisions(state, urlhash);
|
|
|
|
}
|
|
|
|
if (r != null) {
|
|
|
|
JSONObject metadata = new JSONObject();
|
|
|
|
DigestURL u;
|
|
|
|
u = r.url != null ? new DigestURL(r.url) : sb.index.fulltext().getURL(r.urlhash);
|
|
|
|
metadata.put("url", u == null ? "unknown" : u.toNormalform(true));
|
|
|
|
metadata.put("dates", r.dates);
|
|
|
|
metadata.put("depth", r.depth);
|
|
|
|
metadata.put("state", state.name());
|
|
|
|
result.put(r.urlhash, metadata);
|
|
|
|
}
|
|
|
|
} catch (IOException |IllegalArgumentException e) {}
|
|
|
|
}
|
|
|
|
} catch (JSONException e) {
|
|
|
|
ConcurrentLog.logException(e);
|
|
|
|
}
|
|
|
|
String json = result.toString();
|
|
|
|
if (post.containsKey("callback")) json = post.get("callback") + "([" + json + "]);";
|
|
|
|
return new ByteArrayInputStream(UTF8.getBytes(json));
|
|
|
|
}
|
|
|
|
|
|
|
|
// for the following methods we always need the durl to fetch data
|
|
|
|
if (durl == null) return null;
|
|
|
|
|
|
|
|
if (xml) {
|
|
|
|
Collection<File> xmlSnapshots = Transactions.findPaths(durl, "xml", Transactions.State.ANY);
|
|
|
|
File xmlFile = null;
|
|
|
|
if (xmlSnapshots.size() == 0) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
xmlFile = xmlSnapshots.iterator().next();
|
|
|
|
try {
|
|
|
|
byte[] xmlBinary = FileUtils.read(xmlFile);
|
|
|
|
return new ByteArrayInputStream(xmlBinary);
|
|
|
|
} catch (IOException e) {
|
|
|
|
ConcurrentLog.logException(e);
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (pdf || pngjpg) {
|
|
|
|
Collection<File> pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.INVENTORY);
|
|
|
|
File pdfFile = null;
|
|
|
|
if (pdfSnapshots.size() == 0) {
|
|
|
|
// if the client is authenticated, we create the pdf on the fly!
|
|
|
|
if (!authenticated) return null;
|
|
|
|
SolrDocument sd = sb.index.fulltext().getMetadata(durl.hash());
|
|
|
|
boolean success = false;
|
|
|
|
if (sd == null) {
|
|
|
|
success = Transactions.store(durl, new Date(), 99, false, true, sb.getConfigBool(SwitchboardConstants.PROXY_TRANSPARENT_PROXY, false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, sb.getConfig("crawler.http.acceptLanguage", null));
|
|
|
|
} else {
|
|
|
|
SolrInputDocument sid = sb.index.fulltext().getDefaultConfiguration().toSolrInputDocument(sd);
|
|
|
|
success = Transactions.store(sid, false, true, true, sb.getConfigBool(SwitchboardConstants.PROXY_TRANSPARENT_PROXY, false) ? "http://127.0.0.1:" + sb.getConfigInt("port", 8090) : null, sb.getConfig("crawler.http.acceptLanguage", null));
|
|
|
|
}
|
|
|
|
if (success) {
|
|
|
|
pdfSnapshots = Transactions.findPaths(durl, "pdf", Transactions.State.ANY);
|
|
|
|
if (pdfSnapshots.size() != 0) pdfFile = pdfSnapshots.iterator().next();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
pdfFile = pdfSnapshots.iterator().next();
|
|
|
|
}
|
|
|
|
if (pdfFile == null) return null;
|
|
|
|
if (pdf) {
|
|
|
|
try {
|
|
|
|
byte[] pdfBinary = FileUtils.read(pdfFile);
|
|
|
|
return new ByteArrayInputStream(pdfBinary);
|
|
|
|
} catch (IOException e) {
|
|
|
|
ConcurrentLog.logException(e);
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (pngjpg) {
|
|
|
|
int width = Math.min(post.getInt("width", DEFAULT_WIDTH), DEFAULT_WIDTH);
|
|
|
|
int height = Math.min(post.getInt("height", DEFAULT_HEIGHT), DEFAULT_HEIGHT);
|
|
|
|
String imageFileStub = pdfFile.getAbsolutePath(); imageFileStub = imageFileStub.substring(0, imageFileStub.length() - 3); // cut off extension
|
|
|
|
File imageFile = new File(imageFileStub + DEFAULT_WIDTH + "." + DEFAULT_HEIGHT + "." + DEFAULT_EXT);
|
|
|
|
if (!imageFile.exists() && authenticated) {
|
|
|
|
Html2Image.pdf2image(pdfFile, imageFile, DEFAULT_WIDTH, DEFAULT_HEIGHT, DEFAULT_DENSITY, DEFAULT_QUALITY);
|
|
|
|
}
|
|
|
|
if (!imageFile.exists()) return null;
|
|
|
|
if (width == DEFAULT_WIDTH && height == DEFAULT_HEIGHT) {
|
|
|
|
try {
|
|
|
|
byte[] imageBinary = FileUtils.read(imageFile);
|
|
|
|
return new ByteArrayInputStream(imageBinary);
|
|
|
|
} catch (IOException e) {
|
|
|
|
ConcurrentLog.logException(e);
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// lets read the file and scale
|
|
|
|
Image image;
|
|
|
|
try {
|
|
|
|
image = ImageParser.parse(imageFile.getAbsolutePath(), FileUtils.read(imageFile));
|
|
|
|
final Image scaled = image.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
|
|
|
|
final MediaTracker mediaTracker = new MediaTracker(new Container());
|
|
|
|
mediaTracker.addImage(scaled, 0);
|
|
|
|
try {mediaTracker.waitForID(0);} catch (final InterruptedException e) {}
|
|
|
|
return new EncodedImage(scaled, ext, true);
|
|
|
|
} catch (IOException e) {
|
|
|
|
ConcurrentLog.logException(e);
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|