From 9bdee5c71c0057d095818dddb0cd15fcf6953399 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Thu, 25 Aug 2011 16:49:20 +0000
Subject: [PATCH] added a servlet that produces a list of term hashes that appear more than 10000 times see /api/termlist_p.xml

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7898 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
Review note: termlist_p.java was revised during review (generic type
arguments restored, unused local removed, documentation added), so the blob
id on its index line still refers to the originally committed content.
The markup of termlist_p.xml was already missing from the reviewed copy and
is reproduced as found.

 htroot/api/termlist_p.java | 101 +++++++++++++++++++++++++++++++++++++
 htroot/api/termlist_p.xml  |  11 ++++
 2 files changed, 112 insertions(+)
 create mode 100644 htroot/api/termlist_p.java
 create mode 100644 htroot/api/termlist_p.xml

diff --git a/htroot/api/termlist_p.java b/htroot/api/termlist_p.java
new file mode 100644
index 000000000..0345b9c3d
--- /dev/null
+++ b/htroot/api/termlist_p.java
@@ -0,0 +1,101 @@
+// termlist_p
+// ------------
+// (C) 2011 by Michael Peter Christen; mc@yacy.net
+// first published 25.08.2011 on http://yacy.net
+//
+// $LastChangedDate: 2011-01-03 21:52:54 +0100 (Mo, 03 Jan 2011) $
+// $LastChangedRevision: 7420 $
+// $LastChangedBy: orbiter $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import java.util.Iterator;
+
+import net.yacy.cora.document.ASCII;
+import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.ranking.Rating;
+import de.anomic.search.Segment;
+import de.anomic.search.Segments;
+import de.anomic.search.Switchboard;
+import de.anomic.server.serverObjects;
+import de.anomic.server.serverSwitch;
+
+/**
+ * Servlet behind /api/termlist_p.xml: lists the term hashes of an index
+ * segment whose count is at least a minimum threshold.
+ */
+public class termlist_p {
+
+    /** Fallback value for the "mincount" request parameter. */
+    private static final long DEFAULT_MINCOUNT = 10000;
+
+    /**
+     * Collects the template properties for the term list.
+     *
+     * <p>Properties written: "terms" (number of listed terms),
+     * "terms_N_termhash" and "terms_N_count" for each listed term,
+     * "maxterm" (hash of the term with the highest count, or "" when no
+     * term has a count above zero) and "maxcount".
+     *
+     * @param header request header; only consulted to verify authentication
+     *               when a "segment" parameter is present
+     * @param post   request parameters, may be null; the keys "mincount"
+     *               and "segment" are read
+     * @param env    server environment; cast to {@link Switchboard}
+     * @return the properties for the response template
+     */
+    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
+
+        final serverObjects prop = new serverObjects();
+        final Switchboard sb = (Switchboard) env;
+        final long mincount = post == null ? DEFAULT_MINCOUNT : post.getLong("mincount", DEFAULT_MINCOUNT);
+
+        // a segment named in the request is only looked up after authentication
+        // succeeds; otherwise, or if that lookup yields null, PUBLIC is used
+        Segment segment = null;
+        if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header, false)) {
+            segment = sb.indexSegments.segment(post.get("segment"));
+        }
+        if (segment == null) {
+            segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
+        }
+
+        final Iterator<Rating<byte[]>> i = segment.termIndex().referenceCountIterator(null, false);
+        int c = 0;
+        byte[] maxterm = null;
+        long maxcount = 0;
+        while (i.hasNext()) {
+            final Rating<byte[]> e = i.next();
+            final long count = e.getScore();
+            // the maximum is tracked over all terms, listed or not
+            if (count > maxcount) {
+                maxcount = count;
+                maxterm = e.getObject();
+            }
+            if (count >= mincount) {
+                prop.put("terms_" + c + "_termhash", ASCII.String(e.getObject()));
+                prop.put("terms_" + c + "_count", count);
+                c++;
+            }
+        }
+        prop.put("terms", c);
+        prop.put("maxterm", maxterm == null ? "" : ASCII.String(maxterm));
+        prop.put("maxcount", maxcount);
+
+        // return rewrite properties
+        return prop;
+    }
+
+}
diff --git a/htroot/api/termlist_p.xml b/htroot/api/termlist_p.xml
new file mode 100644
index 000000000..46545105e
--- /dev/null
+++ b/htroot/api/termlist_p.xml
@@ -0,0 +1,11 @@
+
+
+
+ #[maxcount]#
+
+#{terms}#
+
+ #[count]#
+
+#{/terms}#
+
\ No newline at end of file