diff --git a/.classpath b/.classpath index 45b1ff7f1..e6ba6ace3 100644 --- a/.classpath +++ b/.classpath @@ -12,7 +12,6 @@ - @@ -30,9 +29,6 @@ - - - @@ -45,8 +41,24 @@ - + + + + + + + + + + + + + + + + + @@ -55,5 +67,6 @@ + diff --git a/addon/YaCy.app/Contents/Info.plist b/addon/YaCy.app/Contents/Info.plist index cbf9172d8..b8d5c6ce8 100644 --- a/addon/YaCy.app/Contents/Info.plist +++ b/addon/YaCy.app/Contents/Info.plist @@ -37,8 +37,10 @@ ClassPath $JAVAROOT/htroot + $JAVAROOT/lib/J7Zip-modified.jar $JAVAROOT/lib/activation.jar $JAVAROOT/lib/apache-mime4j-0.6.jar + $JAVAROOT/lib/apache-solr-core-3.6.0.jar $JAVAROOT/lib/apache-solr-solrj-3.6.0.jar $JAVAROOT/lib/arq-2.8.7.jar $JAVAROOT/lib/bcmail-jdk15-145.jar @@ -46,11 +48,14 @@ $JAVAROOT/lib/commons-codec-1.6.jar $JAVAROOT/lib/commons-compress-1.4.1.jar $JAVAROOT/lib/commons-fileupload-1.2.2.jar + $JAVAROOT/lib/commons-httpclient-3.1.jar $JAVAROOT/lib/commons-io-2.1.jar $JAVAROOT/lib/commons-jxpath-1.3.jar + $JAVAROOT/lib/commons-lang-2.6.jar $JAVAROOT/lib/commons-logging-1.1.1.jar - $JAVAROOT/lib/fontbox-1.6.0.jar + $JAVAROOT/lib/fontbox-1.7.0.jar $JAVAROOT/lib/geronimo-stax-api_1.0_spec-1.0.1.jar + $JAVAROOT/lib/guava-r05.jar $JAVAROOT/lib/htmllexer.jar $JAVAROOT/lib/htmlparser.jar $JAVAROOT/lib/httpclient-4.2.jar @@ -58,22 +63,30 @@ $JAVAROOT/lib/httpmime-4.2.jar $JAVAROOT/lib/icu4j-core.jar $JAVAROOT/lib/iri-0.8.jar - $JAVAROOT/lib/J7Zip-modified.jar $JAVAROOT/lib/jakarta-oro-2.0.8.jar $JAVAROOT/lib/jcifs-1.3.15.jar $JAVAROOT/lib/jcl-over-slf4j-1.6.1.jar - $JAVAROOT/lib/jempbox-1.6.0.jar + $JAVAROOT/lib/jempbox-1.7.0.jar $JAVAROOT/lib/jena-2.6.4.jar + $JAVAROOT/lib/jetty-6.1.26-patched-JETTY-1340.jar + $JAVAROOT/lib/jetty-util-6.1.26-patched-JETTY-1340.jar $JAVAROOT/lib/jsch-0.1.42.jar $JAVAROOT/lib/json-simple-1.1.jar $JAVAROOT/lib/log4j-1.2.16.jar + $JAVAROOT/lib/log4j-over-slf4j-1.6.1.jar + $JAVAROOT/lib/lucene-analyzers-3.6.0.jar + $JAVAROOT/lib/lucene-core-3.6.0.jar + 
$JAVAROOT/lib/lucene-highlighter-3.6.0.jar + $JAVAROOT/lib/lucene-phonetic-3.6.0.jar + $JAVAROOT/lib/lucene-spatial-3.6.0.jar + $JAVAROOT/lib/lucene-spellchecker-3.6.0.jar $JAVAROOT/lib/metadata-extractor-2.4.0-beta-1.jar $JAVAROOT/lib/mysql-connector-java-5.1.12-bin.jar - $JAVAROOT/lib/pdfbox-1.6.0.jar + $JAVAROOT/lib/pdfbox-1.7.0.jar $JAVAROOT/lib/poi-3.6-20091214.jar $JAVAROOT/lib/poi-scratchpad-3.6-20091214.jar $JAVAROOT/lib/sax-2.0.1.jar - $JAVAROOT/lib/servlet-api.jar + $JAVAROOT/lib/servlet-api-2.5-20081211.jar $JAVAROOT/lib/slf4j-api-1.6.1.jar $JAVAROOT/lib/slf4j-jdk14-1.6.1.jar $JAVAROOT/lib/webcat-0.1-swf.jar diff --git a/bin/checkalive.sh b/bin/checkalive.sh index 56d836c00..672fc3c28 100755 --- a/bin/checkalive.sh +++ b/bin/checkalive.sh @@ -1,7 +1,9 @@ #!/bin/bash -# add in /etc/crontab -# 0 * * * * yacy cd /home/yacy/production/bin && ./checkalive.sh +# for a production environment with high-availability requirement, +# (and if you are using the debian version of yacy) +# add the following line in /etc/crontab +# 0 * * * * root cd /usr/share/yacy/bin && ./checkalive.sh port=$(grep ^port= ../DATA/SETTINGS/yacy.conf |cut -d= -f2) RESULT=`wget -t 1 --spider http://localhost:$port/Status.html 2>&1` diff --git a/build.xml b/build.xml index fc5c83f80..12bdba82e 100644 --- a/build.xml +++ b/build.xml @@ -58,6 +58,7 @@ + @@ -155,8 +156,10 @@ + + @@ -164,11 +167,14 @@ + + - + + @@ -176,22 +182,30 @@ - - + + + + + + + + + + - + - + @@ -199,7 +213,7 @@ - + @@ -246,14 +260,6 @@ - - - - - - - - - - - + + + diff --git a/RDFaParser/RDFaParser.xsl b/defaults/RDFaParser.xsl similarity index 100% rename from RDFaParser/RDFaParser.xsl rename to defaults/RDFaParser.xsl diff --git a/defaults/solr/currency.xml b/defaults/solr/currency.xml new file mode 100644 index 000000000..3a9c58afe --- /dev/null +++ b/defaults/solr/currency.xml @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/defaults/solr/elevate.xml b/defaults/solr/elevate.xml new file mode 100644 index 000000000..25d5cebe4 --- /dev/null +++ b/defaults/solr/elevate.xml @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + diff --git a/defaults/solr/lang/contractions_ca.txt b/defaults/solr/lang/contractions_ca.txt new file mode 100644 index 000000000..307a85f91 --- /dev/null +++ b/defaults/solr/lang/contractions_ca.txt @@ -0,0 +1,8 @@ +# Set of Catalan contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +l +m +n +s +t diff --git a/defaults/solr/lang/contractions_fr.txt b/defaults/solr/lang/contractions_fr.txt new file mode 100644 index 000000000..722db5883 --- /dev/null +++ b/defaults/solr/lang/contractions_fr.txt @@ -0,0 +1,9 @@ +# Set of French contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +l +m +t +qu +n +s +j diff --git a/defaults/solr/lang/contractions_ga.txt b/defaults/solr/lang/contractions_ga.txt new file mode 100644 index 000000000..9ebe7fa34 --- /dev/null +++ b/defaults/solr/lang/contractions_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +m +b diff --git a/defaults/solr/lang/contractions_it.txt b/defaults/solr/lang/contractions_it.txt new file mode 100644 index 000000000..cac040953 --- /dev/null +++ b/defaults/solr/lang/contractions_it.txt @@ -0,0 +1,23 @@ +# Set of Italian contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +c +l +all +dall +dell +nell +sull +coll +pell +gl +agl +dagl +degl +negl +sugl +un +m +t +s +v +d diff --git a/defaults/solr/lang/hyphenations_ga.txt b/defaults/solr/lang/hyphenations_ga.txt new file mode 100644 index 000000000..4d2642cc5 --- /dev/null +++ b/defaults/solr/lang/hyphenations_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish hyphenations for StopFilter +# TODO: load this as a 
resource from the analyzer and sync it in build.xml +h +n +t diff --git a/defaults/solr/lang/stemdict_nl.txt b/defaults/solr/lang/stemdict_nl.txt new file mode 100644 index 000000000..441072971 --- /dev/null +++ b/defaults/solr/lang/stemdict_nl.txt @@ -0,0 +1,6 @@ +# Set of overrides for the dutch stemmer +# TODO: load this as a resource from the analyzer and sync it in build.xml +fiets fiets +bromfiets bromfiets +ei eier +kind kinder diff --git a/defaults/solr/lang/stoptags_ja.txt b/defaults/solr/lang/stoptags_ja.txt new file mode 100644 index 000000000..71b750845 --- /dev/null +++ b/defaults/solr/lang/stoptags_ja.txt @@ -0,0 +1,420 @@ +# +# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. +# +# Any token with a part-of-speech tag that exactly matches those defined in this +# file are removed from the token stream. +# +# Set your own stoptags by uncommenting the lines below. Note that comments are +# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, +# etc. that can be useful for building you own stoptag set. +# +# The entire possible tagset is provided below for convenience. +# +##### +# noun: unclassified nouns +#名詞 +# +# noun-common: Common nouns or nouns where the sub-classification is undefined +#名詞-一般 +# +# noun-proper: Proper nouns where the sub-classification is undefined +#名詞-固有名詞 +# +# noun-proper-misc: miscellaneous proper nouns +#名詞-固有名詞-一般 +# +# noun-proper-person: Personal names where the sub-classification is undefined +#名詞-固有名詞-人名 +# +# noun-proper-person-misc: names that cannot be divided into surname and +# given name; foreign names; names where the surname or given name is unknown. +# e.g. お市の方 +#名詞-固有名詞-人名-一般 +# +# noun-proper-person-surname: Mainly Japanese surnames. +# e.g. 山田 +#名詞-固有名詞-人名-姓 +# +# noun-proper-person-given_name: Mainly Japanese given names. +# e.g. 太郎 +#名詞-固有名詞-人名-名 +# +# noun-proper-organization: Names representing organizations. +# e.g. 
通産省, NHK +#名詞-固有名詞-組織 +# +# noun-proper-place: Place names where the sub-classification is undefined +#名詞-固有名詞-地域 +# +# noun-proper-place-misc: Place names excluding countries. +# e.g. アジア, バルセロナ, 京都 +#名詞-固有名詞-地域-一般 +# +# noun-proper-place-country: Country names. +# e.g. 日本, オーストラリア +#名詞-固有名詞-地域-国 +# +# noun-pronoun: Pronouns where the sub-classification is undefined +#名詞-代名詞 +# +# noun-pronoun-misc: miscellaneous pronouns: +# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ +#名詞-代名詞-一般 +# +# noun-pronoun-contraction: Spoken language contraction made by combining a +# pronoun and the particle 'wa'. +# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ +#名詞-代名詞-縮約 +# +# noun-adverbial: Temporal nouns such as names of days or months that behave +# like adverbs. Nouns that represent amount or ratios and can be used adverbially, +# e.g. 金曜, 一月, 午後, 少量 +#名詞-副詞可能 +# +# noun-verbal: Nouns that take arguments with case and can appear followed by +# 'suru' and related verbs (する, できる, なさる, くださる) +# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り +#名詞-サ変接続 +# +# noun-adjective-base: The base form of adjectives, words that appear before な ("na") +# e.g. 健康, 安易, 駄目, だめ +#名詞-形容動詞語幹 +# +# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数. +# e.g. 0, 1, 2, 何, 数, 幾 +#名詞-数 +# +# noun-affix: noun affixes where the sub-classification is undefined +#名詞-非自立 +# +# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that +# attach to the base form of inflectional words, words that cannot be classified +# into any of the other categories below. This category includes indefinite nouns. +# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, +# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, +# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳, +# わり, 割り, 割, ん-口語/, もん-口語/ +#名詞-非自立-一般 +# +# noun-affix-adverbial: noun affixes that can behave as adverbs. +# e.g. 
あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, +# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, +# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, +# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, +# 儘, 侭, みぎり, 矢先 +#名詞-非自立-副詞可能 +# +# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars +# with the stem よう(だ) ("you(da)"). +# e.g. よう, やう, 様 (よう) +#名詞-非自立-助動詞語幹 +# +# noun-affix-adjective-base: noun affixes that can connect to the indeclinable +# connection form な (aux "da"). +# e.g. みたい, ふう +#名詞-非自立-形容動詞語幹 +# +# noun-special: special nouns where the sub-classification is undefined. +#名詞-特殊 +# +# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is +# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base +# form of inflectional words. +# e.g. そう +#名詞-特殊-助動詞語幹 +# +# noun-suffix: noun suffixes where the sub-classification is undefined. +#名詞-接尾 +# +# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect +# to ガル or タイ and can combine into compound nouns, words that cannot be classified into +# any of the other categories below. In general, this category is more inclusive than +# 接尾語 ("suffix") and is usually the last element in a compound noun. +# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み, +# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用 +#名詞-接尾-一般 +# +# noun-suffix-person: Suffixes that form nouns and attach to person names more often +# than other nouns. +# e.g. 君, 様, 著 +#名詞-接尾-人名 +# +# noun-suffix-place: Suffixes that form nouns and attach to place names more often +# than other nouns. +# e.g. 町, 市, 県 +#名詞-接尾-地域 +# +# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that +# can appear before スル ("suru"). +# e.g. 
化, 視, 分け, 入り, 落ち, 買い +#名詞-接尾-サ変接続 +# +# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, +# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the +# conjunctive form of inflectional words. +# e.g. そう +#名詞-接尾-助動詞語幹 +# +# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive +# form of inflectional words and appear before the copula だ ("da"). +# e.g. 的, げ, がち +#名詞-接尾-形容動詞語幹 +# +# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. +# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ) +#名詞-接尾-副詞可能 +# +# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category +# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach +# to numbers. +# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半 +#名詞-接尾-助数詞 +# +# noun-suffix-special: Special suffixes that mainly attach to inflecting words. +# e.g. (楽し) さ, (考え) 方 +#名詞-接尾-特殊 +# +# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words +# together. +# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦) +#名詞-接続詞的 +# +# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are +# semantically verb-like. +# e.g. ごらん, ご覧, 御覧, 頂戴 +#名詞-動詞非自立的 +# +# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, +# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") +# is いわく ("iwaku"). +#名詞-引用文字列 +# +# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and +# behave like an adjective. +# e.g. 申し訳, 仕方, とんでも, 違い +#名詞-ナイ形容詞語幹 +# +##### +# prefix: unclassified prefixes +#接頭詞 +# +# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) +# excluding numerical expressions. +# e.g. 
お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派) +#接頭詞-名詞接続 +# +# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb +# in conjunctive form followed by なる/なさる/くださる. +# e.g. お (読みなさい), お (座り) +#接頭詞-動詞接続 +# +# prefix-adjectival: Prefixes that attach to adjectives. +# e.g. お (寒いですねえ), バカ (でかい) +#接頭詞-形容詞接続 +# +# prefix-numerical: Prefixes that attach to numerical expressions. +# e.g. 約, およそ, 毎時 +#接頭詞-数接続 +# +##### +# verb: unclassified verbs +#動詞 +# +# verb-main: +#動詞-自立 +# +# verb-auxiliary: +#動詞-非自立 +# +# verb-suffix: +#動詞-接尾 +# +##### +# adjective: unclassified adjectives +#形容詞 +# +# adjective-main: +#形容詞-自立 +# +# adjective-auxiliary: +#形容詞-非自立 +# +# adjective-suffix: +#形容詞-接尾 +# +##### +# adverb: unclassified adverbs +#副詞 +# +# adverb-misc: Words that can be segmented into one unit and where adnominal +# modification is not possible. +# e.g. あいかわらず, 多分 +#副詞-一般 +# +# adverb-particle_conjunction: Adverbs that can be followed by の, は, に, +# な, する, だ, etc. +# e.g. こんなに, そんなに, あんなに, なにか, なんでも +#副詞-助詞類接続 +# +##### +# adnominal: Words that only have noun-modifying forms. +# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, +# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, +# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き +#連体詞 +# +##### +# conjunction: Conjunctions that can occur independently. +# e.g. が, けれども, そして, じゃあ, それどころか +接続詞 +# +##### +# particle: unclassified particles. +助詞 +# +# particle-case: case particles where the subclassification is undefined. +助詞-格助詞 +# +# particle-case-misc: Case particles. +# e.g. から, が, で, と, に, へ, より, を, の, にて +助詞-格助詞-一般 +# +# particle-case-quote: the "to" that appears after nouns, a person’s speech, +# quotation marks, expressions of decisions from a meeting, reasons, judgements, +# conjectures, etc. +# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...) +助詞-格助詞-引用 +# +# particle-case-compound: Compounds of particles and verbs that mainly behave +# like case particles. +# e.g. 
という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って, +# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, +# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, +# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, +# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって, +# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, +# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる, +# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ +助詞-格助詞-連語 +# +# particle-conjunctive: +# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, +# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, +# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/ +助詞-接続助詞 +# +# particle-dependency: +# e.g. こそ, さえ, しか, すら, は, も, ぞ +助詞-係助詞 +# +# particle-adverbial: +# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, +# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/, +# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, +# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/, +# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」) +助詞-副助詞 +# +# particle-interjective: particles with interjective grammatical roles. +# e.g. (松島) や +助詞-間投助詞 +# +# particle-coordinate: +# e.g. と, たり, だの, だり, とか, なり, や, やら +助詞-並立助詞 +# +# particle-final: +# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, +# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/ +助詞-終助詞 +# +# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is +# adverbial, conjunctive, or sentence final. For example: +# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」 +# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」 +# 「(祈りが届いたせい) か (, 試験に合格した.)」 +# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」 +# e.g. か +助詞-副助詞/並立助詞/終助詞 +# +# particle-adnominalizer: The "no" that attaches to nouns and modifies +# non-inflectional words. 
+助詞-連体化 +# +# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs +# that are giongo, giseigo, or gitaigo. +# e.g. に, と +助詞-副詞化 +# +# particle-special: A particle that does not fit into one of the above classifications. +# This includes particles that are used in Tanka, Haiku, and other poetry. +# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家) +助詞-特殊 +# +##### +# auxiliary-verb: +助動詞 +# +##### +# interjection: Greetings and other exclamations. +# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, +# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい +#感動詞 +# +##### +# symbol: unclassified Symbols. +記号 +# +# symbol-misc: A general symbol not in one of the categories below. +# e.g. [○◎@$〒→+] +記号-一般 +# +# symbol-comma: Commas +# e.g. [,、] +記号-読点 +# +# symbol-period: Periods and full stops. +# e.g. [..。] +記号-句点 +# +# symbol-space: Full-width whitespace. +記号-空白 +# +# symbol-open_bracket: +# e.g. [({‘“『【] +記号-括弧開 +# +# symbol-close_bracket: +# e.g. [)}’”』」】] +記号-括弧閉 +# +# symbol-alphabetic: +#記号-アルファベット +# +##### +# other: unclassified other +#その他 +# +# other-interjection: Words that are hard to classify as noun-suffixes or +# sentence-final particles. +# e.g. (だ)ァ +その他-間投 +# +##### +# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. +# e.g. あの, うんと, えと +フィラー +# +##### +# non-verbal: non-verbal sound. +非言語音 +# +##### +# fragment: +#語断片 +# +##### +# unknown: unknown part of speech. +#未知語 +# +##### End of file diff --git a/defaults/solr/lang/stopwords_ar.txt b/defaults/solr/lang/stopwords_ar.txt new file mode 100644 index 000000000..046829db6 --- /dev/null +++ b/defaults/solr/lang/stopwords_ar.txt @@ -0,0 +1,125 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +# Cleaned on October 11, 2009 (not normalized, so use before normalization) +# This means that when modifying this list, you might need to add some +# redundant entries, for example containing forms with both أ and ا +من +ومن +منها +منه +في +وفي +فيها +فيه +و +ف +ثم +او +أو +ب +بها +به +ا +أ +اى +اي +أي +أى +لا +ولا +الا +ألا +إلا +لكن +ما +وما +كما +فما +عن +مع +اذا +إذا +ان +أن +إن +انها +أنها +إنها +انه +أنه +إنه +بان +بأن +فان +فأن +وان +وأن +وإن +التى +التي +الذى +الذي +الذين +الى +الي +إلى +إلي +على +عليها +عليه +اما +أما +إما +ايضا +أيضا +كل +وكل +لم +ولم +لن +ولن +هى +هي +هو +وهى +وهي +وهو +فهى +فهي +فهو +انت +أنت +لك +لها +له +هذه +هذا +تلك +ذلك +هناك +كانت +كان +يكون +تكون +وكانت +وكان +غير +بعض +قد +نحو +بين +بينما +منذ +ضمن +حيث +الان +الآن +خلال +بعد +قبل +حتى +عند +عندما +لدى +جميع diff --git a/defaults/solr/lang/stopwords_bg.txt b/defaults/solr/lang/stopwords_bg.txt new file mode 100644 index 000000000..1ae4ba2ae --- /dev/null +++ b/defaults/solr/lang/stopwords_bg.txt @@ -0,0 +1,193 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +а +аз +ако +ала +бе +без +беше +би +бил +била +били +било +близо +бъдат +бъде +бяха +в +вас +ваш +ваша +вероятно +вече +взема +ви +вие +винаги +все +всеки +всички +всичко +всяка +във +въпреки +върху +г +ги +главно +го +д +да +дали +до +докато +докога +дори +досега +доста +е +едва +един +ето +за +зад +заедно +заради +засега +затова +защо +защото +и +из +или +им +има +имат +иска +й +каза +как +каква +какво +както +какъв +като +кога +когато +което +които +кой +който +колко +която +къде +където +към +ли +м +ме +между +мен +ми +мнозина +мога +могат +може +моля +момента +му +н +на +над +назад +най +направи +напред +например +нас +не +него +нея +ни +ние +никой +нито +но +някои +някой +няма +обаче +около +освен +особено +от +отгоре +отново +още +пак +по +повече +повечето +под +поне +поради +после +почти +прави +пред +преди +през +при +пък +първо +с +са +само +се +сега +си +скоро +след +сме +според +сред +срещу +сте +съм +със +също +т +тази +така +такива +такъв +там +твой +те +тези +ти +тн +то +това +тогава +този +той +толкова +точно +трябва +тук +тъй +тя +тях +у +харесва +ч +че +често +чрез +ще +щом +я diff --git a/defaults/solr/lang/stopwords_ca.txt b/defaults/solr/lang/stopwords_ca.txt new file mode 100644 index 000000000..3da65deaf --- /dev/null +++ b/defaults/solr/lang/stopwords_ca.txt @@ -0,0 +1,220 @@ +# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) +a +abans +ací +ah +així +això +al +als +aleshores +algun +alguna +algunes +alguns +alhora +allà +allí +allò +altra +altre +altres +amb +ambdós +ambdues +apa +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquí +baix +cada +cadascú +cadascuna +cadascunes +cadascuns +com +contra +d'un +d'una +d'unes +d'uns +dalt +de +del +dels +des +després +dins +dintre +donat +doncs +durant +e +eh +el +els +em +en +encara +ens +entre +érem +eren +éreu +es +és +esta +està +estàvem +estaven +estàveu +esteu +et +etc +ets 
+fins +fora +gairebé +ha +han +has +havia +he +hem +heu +hi +ho +i +igual +iguals +ja +l'hi +la +les +li +li'n +llavors +m'he +ma +mal +malgrat +mateix +mateixa +mateixes +mateixos +me +mentre +més +meu +meus +meva +meves +molt +molta +moltes +molts +mon +mons +n'he +n'hi +ne +ni +no +nogensmenys +només +nosaltres +nostra +nostre +nostres +o +oh +oi +on +pas +pel +pels +per +però +perquè +poc +poca +pocs +poques +potser +propi +qual +quals +quan +quant +que +què +quelcom +qui +quin +quina +quines +quins +s'ha +s'han +sa +semblant +semblants +ses +seu +seus +seva +seva +seves +si +sobre +sobretot +sóc +solament +sols +son +són +sons +sota +sou +t'ha +t'han +t'he +ta +tal +també +tampoc +tan +tant +tanta +tantes +teu +teus +teva +teves +ton +tons +tot +tota +totes +tots +un +una +unes +uns +us +va +vaig +vam +van +vas +veu +vosaltres +vostra +vostre +vostres diff --git a/defaults/solr/lang/stopwords_cz.txt b/defaults/solr/lang/stopwords_cz.txt new file mode 100644 index 000000000..53c6097da --- /dev/null +++ b/defaults/solr/lang/stopwords_cz.txt @@ -0,0 +1,172 @@ +a +s +k +o +i +u +v +z +dnes +cz +tímto +budeš +budem +byli +jseš +můj +svým +ta +tomto +tohle +tuto +tyto +jej +zda +proč +máte +tato +kam +tohoto +kdo +kteří +mi +nám +tom +tomuto +mít +nic +proto +kterou +byla +toho +protože +asi +ho +naši +napište +re +což +tím +takže +svých +její +svými +jste +aj +tu +tedy +teto +bylo +kde +ke +pravé +ji +nad +nejsou +či +pod +téma +mezi +přes +ty +pak +vám +ani +když +však +neg +jsem +tento +článku +články +aby +jsme +před +pta +jejich +byl +ještě +až +bez +také +pouze +první +vaše +která +nás +nový +tipy +pokud +může +strana +jeho +své +jiné +zprávy +nové +není +vás +jen +podle +zde +už +být +více +bude +již +než +který +by +které +co +nebo +ten +tak +má +při +od +po +jsou +jak +další +ale +si +se +ve +to +jako +za +zpět +ze +do +pro +je +na +atd +atp +jakmile +přičemž +já +on +ona +ono +oni +ony +my +vy +jí +ji +mě +mne +jemu +tomu +těm +těmu +němu +němuž +jehož 
+jíž +jelikož +jež +jakož +načež diff --git a/defaults/solr/lang/stopwords_da.txt b/defaults/solr/lang/stopwords_da.txt new file mode 100644 index 000000000..a3ff5fe12 --- /dev/null +++ b/defaults/solr/lang/stopwords_da.txt @@ -0,0 +1,108 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A Danish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + +og | and +i | in +jeg | I +det | that (dem. pronoun)/it (pers. pronoun) +at | that (in front of a sentence)/to (with infinitive) +en | a/an +den | it (pers. pronoun)/that (dem. pronoun) +til | to/at/for/until/against/by/of/into, more +er | present tense of "to be" +som | who, as +på | on/upon/in/on/at/to/after/of/with/for, on +de | they +med | with/by/in, along +han | he +af | of/by/from/off/for/in/with/on, off +for | at/for/to/from/by/of/ago, in front/before, because +ikke | not +der | who/which, there/those +var | past tense of "to be" +mig | me/myself +sig | oneself/himself/herself/itself/themselves +men | but +et | a/an/one, one (number), someone/somebody/one +har | present tense of "to have" +om | round/about/for/in/a, about/around/down, if +vi | we +min | my +havde | past tense of "to have" +ham | him +hun | she +nu | now +over | over/above/across/by/beyond/past/on/about, over/past +da | then, when/as/since +fra | from/off/since, off, since +du | you +ud | out +sin | his/her/its/one's +dem | them +os | us/ourselves +op | up +man | you/one +hans | his +hvor | where +eller | or +hvad | what +skal | must/shall etc. 
selv | myself/yourself/herself/ourselves etc., even +her | here +alle | all/everyone/everybody etc. +vil | will (verb) +blev | past tense of "to stay/to remain/to get/to become" +kunne | could +ind | in +når | when +være | present tense of "to be" +dog | however/yet/after all +noget | something +ville | would +jo | you know/you see (adv), yes +deres | their/theirs +efter | after/behind/according to/for/by/from, later/afterwards +ned | down +skulle | should +denne | this +end | than +dette | this +mit | my/mine +også | also +under | under/beneath/below/during, below/underneath +have | have +dig | you +anden | other +hende | her +mine | my +alt | everything +meget | much/very, plenty of +sit | his, her, its, one's +sine | his, her, its, one's +vor | our +mod | against +disse | these +hvis | if +din | your/yours +nogle | some +hos | by/at +blive | be/become +mange | many +ad | by/through +bliver | present tense of "to be/to become" +hendes | her/hers +været | be +thi | for (conj) +jer | you +sådan | such, like this/like that diff --git a/defaults/solr/lang/stopwords_de.txt b/defaults/solr/lang/stopwords_de.txt new file mode 100644 index 000000000..f77038418 --- /dev/null +++ b/defaults/solr/lang/stopwords_de.txt @@ -0,0 +1,292 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. 
+ + +aber | but + +alle | all +allem +allen +aller +alles + +als | than, as +also | so +am | an + dem +an | at + +ander | other +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders + +auch | also +auf | on +aus | out of +bei | by +bin | am +bis | until +bist | art +da | there +damit | with it +dann | then + +der | the +den +des +dem +die +das + +daß | that + +derselbe | the same +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe + +dazu | to that + +dein | thy +deine +deinem +deinen +deiner +deines + +denn | because + +derer | of those +dessen | of him + +dich | thee +dir | to thee +du | thou + +dies | this +diese +diesem +diesen +dieser +dieses + + +doch | (several meanings) +dort | (over) there + + +durch | through + +ein | a +eine +einem +einen +einer +eines + +einig | some +einige +einigem +einigen +einiger +einiges + +einmal | once + +er | he +ihn | him +ihm | to him + +es | it +etwas | something + +euer | your +eure +eurem +euren +eurer +eures + +für | for +gegen | towards +gewesen | p.p. 
of sein +hab | have +habe | have +haben | have +hat | has +hatte | had +hatten | had +hier | here +hin | there +hinter | behind + +ich | I +mich | me +mir | to me + + +ihr | you, to her +ihre +ihrem +ihren +ihrer +ihres +euch | to you + +im | in + dem +in | in +indem | while +ins | in + das +ist | is + +jede | each, every +jedem +jeden +jeder +jedes + +jene | that +jenem +jenen +jener +jenes + +jetzt | now +kann | can + +kein | no +keine +keinem +keinen +keiner +keines + +können | can +könnte | could +machen | do +man | one + +manche | some, many a +manchem +manchen +mancher +manches + +mein | my +meine +meinem +meinen +meiner +meines + +mit | with +muss | must +musste | had to +nach | to(wards) +nicht | not +nichts | nothing +noch | still, yet +nun | now +nur | only +ob | whether +oder | or +ohne | without +sehr | very + +sein | his +seine +seinem +seinen +seiner +seines + +selbst | self +sich | herself + +sie | they, she +ihnen | to them + +sind | are +so | so + +solche | such +solchem +solchen +solcher +solches + +soll | shall +sollte | should +sondern | but +sonst | else +über | over +um | about, around +und | and + +uns | us +unse +unsem +unsen +unser +unses + +unter | under +viel | much +vom | von + dem +von | from +vor | before +während | while +war | was +waren | were +warst | wast +was | what +weg | away, off +weil | because +weiter | further + +welche | which +welchem +welchen +welcher +welches + +wenn | when +werde | will +werden | will +wie | how +wieder | again +will | want +wir | we +wird | will +wirst | willst +wo | where +wollen | want +wollte | wanted +würde | would +würden | would +zu | to +zum | zu + dem +zur | zu + der +zwar | indeed +zwischen | between + diff --git a/defaults/solr/lang/stopwords_el.txt b/defaults/solr/lang/stopwords_el.txt new file mode 100644 index 000000000..232681f5b --- /dev/null +++ b/defaults/solr/lang/stopwords_el.txt @@ -0,0 +1,78 @@ +# Lucene Greek Stopwords list +# Note: by default this file is used after 
GreekLowerCaseFilter, +# so when modifying this file use 'σ' instead of 'ς' +ο +η +το +οι +τα +του +τησ +των +τον +την +και +κι +κ +ειμαι +εισαι +ειναι +ειμαστε +ειστε +στο +στον +στη +στην +μα +αλλα +απο +για +προσ +με +σε +ωσ +παρα +αντι +κατα +μετα +θα +να +δε +δεν +μη +μην +επι +ενω +εαν +αν +τοτε +που +πωσ +ποιοσ +ποια +ποιο +ποιοι +ποιεσ +ποιων +ποιουσ +αυτοσ +αυτη +αυτο +αυτοι +αυτων +αυτουσ +αυτεσ +αυτα +εκεινοσ +εκεινη +εκεινο +εκεινοι +εκεινεσ +εκεινα +εκεινων +εκεινουσ +οπωσ +ομωσ +ισωσ +οσο +οτι diff --git a/defaults/solr/lang/stopwords_en.txt b/defaults/solr/lang/stopwords_en.txt new file mode 100644 index 000000000..2c164c0b2 --- /dev/null +++ b/defaults/solr/lang/stopwords_en.txt @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with diff --git a/defaults/solr/lang/stopwords_es.txt b/defaults/solr/lang/stopwords_es.txt new file mode 100644 index 000000000..2db147600 --- /dev/null +++ b/defaults/solr/lang/stopwords_es.txt @@ -0,0 +1,354 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A Spanish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | from, of +la | the, her +que | who, that +el | the +en | in +y | and +a | to +los | the, them +del | de + el +se | himself, from him etc +las | the, them +por | for, by, etc +un | a +para | for +con | with +no | no +una | a +su | his, her +al | a + el + | es from SER +lo | him +como | how +más | more +pero | pero +sus | su plural +le | to him, her +ya | already +o | or + | fue from SER +este | this + | ha from HABER +sí | himself etc +porque | because +esta | this + | son from SER +entre | between + | está from ESTAR +cuando | when +muy | very +sin | without +sobre | on + | ser from SER + | tiene from TENER +también | also +me | me +hasta | until +hay | there is/are +donde | where + | han from HABER +quien | whom, that + | están from ESTAR + | estado from ESTAR +desde | from +todo | all +nos | us +durante | during + | estados from ESTAR +todos | all +uno | a +les | to them +ni | nor +contra | against +otros | other + | fueron from SER +ese | that +eso | that + | había from HABER +ante | before +ellos | they +e | and (variant of y) +esto | this +mí | me +antes | before +algunos | some +qué | what? 
+unos | a +yo | I +otro | other +otras | other +otra | other +él | he +tanto | so much, many +esa | that +estos | these +mucho | much, many +quienes | who +nada | nothing +muchos | many +cual | who + | sea from SER +poco | few +ella | she +estar | to be + | haber from HABER +estas | these + | estaba from ESTAR + | estamos from ESTAR +algunas | some +algo | something +nosotros | we + + | other forms + +mi | me +mis | mi plural +tú | thou +te | thee +ti | thee +tu | thy +tus | tu plural +ellas | they +nosotras | we +vosotros | you +vosotras | you +os | you +mío | mine +mía | +míos | +mías | +tuyo | thine +tuya | +tuyos | +tuyas | +suyo | his, hers, theirs +suya | +suyos | +suyas | +nuestro | ours +nuestra | +nuestros | +nuestras | +vuestro | yours +vuestra | +vuestros | +vuestras | +esos | those +esas | those + + | forms of estar, to be (not including the infinitive): +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad + + | forms of haber, to have (not including the infinitive): +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas + + | forms of ser, to be (not including the infinitive): +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería 
+serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +siendo +sido + | sed also means 'thirst' + + | forms of tener, to have (not including the infinitive): +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened + diff --git a/defaults/solr/lang/stopwords_eu.txt b/defaults/solr/lang/stopwords_eu.txt new file mode 100644 index 000000000..25f1db934 --- /dev/null +++ b/defaults/solr/lang/stopwords_eu.txt @@ -0,0 +1,99 @@ +# example set of basque stopwords +al +anitz +arabera +asko +baina +bat +batean +batek +bati +batzuei +batzuek +batzuetan +batzuk +bera +beraiek +berau +berauek +bere +berori +beroriek +beste +bezala +da +dago +dira +ditu +du +dute +edo +egin +ere +eta +eurak +ez +gainera +gu +gutxi +guzti +haiei +haiek +haietan +hainbeste +hala +han +handik +hango +hara +hari +hark +hartan +hau +hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honek +honela +honetan +honi +hor +hori +horiei +horiek +horietan +horko +horra +horrek +horrela +horretan +horri +hortik +hura +izan +ni +noiz +nola +non +nondik +nongo +nor +nora +ze +zein +zen +zenbait +zenbat +zer +zergatik +ziren +zituen +zu +zuek +zuen +zuten diff --git a/defaults/solr/lang/stopwords_fa.txt b/defaults/solr/lang/stopwords_fa.txt new file mode 100644 index 000000000..723641c6d --- /dev/null +++ b/defaults/solr/lang/stopwords_fa.txt @@ -0,0 +1,313 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. 
+# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +# Note: by default this file is used after normalization, so when adding entries +# to this file, use the arabic 'ي' instead of 'ی' +انان +نداشته +سراسر +خياه +ايشان +وي +تاكنون +بيشتري +دوم +پس +ناشي +وگو +يا +داشتند +سپس +هنگام +هرگز +پنج +نشان +امسال +ديگر +گروهي +شدند +چطور +ده +و +دو +نخستين +ولي +چرا +چه +وسط +ه +كدام +قابل +يك +رفت +هفت +همچنين +در +هزار +بله +بلي +شايد +اما +شناسي +گرفته +دهد +داشته +دانست +داشتن +خواهيم +ميليارد +وقتيكه +امد +خواهد +جز +اورده +شده +بلكه +خدمات +شدن +برخي +نبود +بسياري +جلوگيري +حق +كردند +نوعي +بعري +نكرده +نظير +نبايد +بوده +بودن +داد +اورد +هست +جايي +شود +دنبال +داده +بايد +سابق +هيچ +همان +انجا +كمتر +كجاست +گردد +كسي +تر +مردم +تان +دادن +بودند +سري +جدا +ندارند +مگر +يكديگر +دارد +دهند +بنابراين +هنگامي +سمت +جا +انچه +خود +دادند +زياد +دارند +اثر +بدون +بهترين +بيشتر +البته +به +براساس +بيرون +كرد +بعضي +گرفت +توي +اي +ميليون +او +جريان +تول +بر +مانند +برابر +باشيم +مدتي +گويند +اكنون +تا +تنها +جديد +چند +بي +نشده +كردن +كردم +گويد +كرده +كنيم +نمي +نزد +روي +قصد +فقط +بالاي +ديگران +اين +ديروز +توسط +سوم +ايم +دانند +سوي +استفاده +شما +كنار +داريم +ساخته +طور +امده +رفته +نخست +بيست +نزديك +طي +كنيد +از +انها +تمامي +داشت +يكي +طريق +اش +چيست +روب +نمايد +گفت +چندين +چيزي +تواند +ام +ايا +با +ان +ايد +ترين +اينكه +ديگري +راه +هايي +بروز +همچنان +پاعين +كس +حدود +مختلف +مقابل +چيز +گيرد +ندارد +ضد +همچون +سازي +شان +مورد +باره +مرسي +خويش +برخوردار +چون +خارج +شش +هنوز +تحت +ضمن +هستيم +گفته +فكر +بسيار +پيش +براي +روزهاي +انكه +نخواهد +بالا +كل +وقتي +كي +چنين +كه +گيري +نيست +است +كجا +كند +نيز +يابد +بندي +حتي +توانند +عقب +خواست +كنند +بين +تمام +همه +ما +باشند +مثل +شد +اري +باشد +اره +طبق +بعد +اگر +صورت +غير +جاي +بيش +ريزي +اند +زيرا +چگونه +بار +لطفا +مي +درباره +من +ديده +همين +گذاري +برداري +علت +گذاشته +هم +فوق +نه +ها +شوند +اباد +همواره +هر +اول +خواهند +چهار +نام +امروز +مان 
+هاي +قبل +كنم +سعي +تازه +را +هستند +زير +جلوي +عنوان +بود diff --git a/defaults/solr/lang/stopwords_fi.txt b/defaults/solr/lang/stopwords_fi.txt new file mode 100644 index 000000000..addad798c --- /dev/null +++ b/defaults/solr/lang/stopwords_fi.txt @@ -0,0 +1,95 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + +| forms of BE + +olla +olen +olet +on +olemme +olette +ovat +ole | negative form + +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet + +en | negation +et +ei +emme +ette +eivät + +|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans +minä minun minut minua minussa minusta minuun minulla minulta minulle | I +sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you +hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she +me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we +te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you +he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they + +tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this +tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that +se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it +nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these +nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those +ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they + +kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who +ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) +mikä minkä minkä mitä missä 
mistä mihin millä miltä mille minä miksi | which what +mitkä | (pl) + +joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which +jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) + +| conjunctions + +että | that +ja | and +jos | if +koska | because +kuin | than +mutta | but +niin | so +sekä | and +sillä | for +tai | or +vaan | but +vai | or +vaikka | although + + +| prepositions + +kanssa | with +mukaan | according to +noin | about +poikki | across +yli | over, across + +| other + +kun | when +niin | so +nyt | now +itse | self + diff --git a/defaults/solr/lang/stopwords_fr.txt b/defaults/solr/lang/stopwords_fr.txt new file mode 100644 index 000000000..c00837ea9 --- /dev/null +++ b/defaults/solr/lang/stopwords_fr.txt @@ -0,0 +1,183 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A French stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. 
+ +au | a + le +aux | a + les +avec | with +ce | this +ces | these +dans | with +de | of +des | de + les +du | de + le +elle | she +en | `of them' etc +et | and +eux | them +il | he +je | I +la | the +le | the +leur | their +lui | him +ma | my (fem) +mais | but +me | me +même | same; as in moi-même (myself) etc +mes | me (pl) +moi | me +mon | my (masc) +ne | not +nos | our (pl) +notre | our +nous | we +on | one +ou | where +par | by +pas | not +pour | for +qu | que before vowel +que | that +qui | who +sa | his, her (fem) +se | oneself +ses | his (pl) +son | his, her (masc) +sur | on +ta | thy (fem) +te | thee +tes | thy (pl) +toi | thee +ton | thy (masc) +tu | thou +un | a +une | a +vos | your (pl) +votre | your +vous | you + + | single letter forms + +c | c' +d | d' +j | j' +l | l' +à | to, at +m | m' +n | n' +s | s' +t | t' +y | there + + | forms of être (not including the infinitive): +été +étée +étées +étés +étant +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent + + | forms of avoir (not including the infinitive): +ayant +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent + + | Later additions (from Jean-Christophe Deschamps) +ceci | this +celà  | that +cet | this +cette | this +ici | here +ils | they +les | the (pl) +leurs | their (pl) +quel | which +quels | which +quelle | which +quelles | which +sans | without +soi | oneself + diff --git a/defaults/solr/lang/stopwords_ga.txt b/defaults/solr/lang/stopwords_ga.txt new file mode 100644 index 000000000..9ff88d747 --- /dev/null +++ 
b/defaults/solr/lang/stopwords_ga.txt @@ -0,0 +1,110 @@ + +a +ach +ag +agus +an +aon +ar +arna +as +b' +ba +beirt +bhúr +caoga +ceathair +ceathrar +chomh +chtó +chuig +chun +cois +céad +cúig +cúigear +d' +daichead +dar +de +deich +deichniúr +den +dhá +do +don +dtí +dá +dár +dó +faoi +faoin +faoina +faoinár +fara +fiche +gach +gan +go +gur +haon +hocht +i +iad +idir +in +ina +ins +inár +is +le +leis +lena +lenár +m' +mar +mo +mé +na +nach +naoi +naonúr +ná +ní +níor +nó +nócha +ocht +ochtar +os +roimh +sa +seacht +seachtar +seachtó +seasca +seisear +siad +sibh +sinn +sna +sé +sí +tar +thar +thú +triúr +trí +trína +trínár +tríocha +tú +um +ár +é +éis +í +ó +ón +óna +ónár diff --git a/defaults/solr/lang/stopwords_gl.txt b/defaults/solr/lang/stopwords_gl.txt new file mode 100644 index 000000000..d8760b12c --- /dev/null +++ b/defaults/solr/lang/stopwords_gl.txt @@ -0,0 +1,161 @@ +# galican stopwords +a +aínda +alí +aquel +aquela +aquelas +aqueles +aquilo +aquí +ao +aos +as +así +á +ben +cando +che +co +coa +comigo +con +connosco +contigo +convosco +coas +cos +cun +cuns +cunha +cunhas +da +dalgunha +dalgunhas +dalgún +dalgúns +das +de +del +dela +delas +deles +desde +deste +do +dos +dun +duns +dunha +dunhas +e +el +ela +elas +eles +en +era +eran +esa +esas +ese +eses +esta +estar +estaba +está +están +este +estes +estiven +estou +eu +é +facer +foi +foron +fun +había +hai +iso +isto +la +las +lle +lles +lo +los +mais +me +meu +meus +min +miña +miñas +moi +na +nas +neste +nin +no +non +nos +nosa +nosas +noso +nosos +nós +nun +nunha +nuns +nunhas +o +os +ou +ó +ós +para +pero +pode +pois +pola +polas +polo +polos +por +que +se +senón +ser +seu +seus +sexa +sido +sobre +súa +súas +tamén +tan +te +ten +teñen +teño +ter +teu +teus +ti +tido +tiña +tiven +túa +túas +un +unha +unhas +uns +vos +vosa +vosas +voso +vosos +vós diff --git a/defaults/solr/lang/stopwords_hi.txt b/defaults/solr/lang/stopwords_hi.txt new file mode 100644 index 000000000..86286bb08 --- /dev/null +++ 
b/defaults/solr/lang/stopwords_hi.txt @@ -0,0 +1,235 @@ +# Also see http://www.opensource.org/licenses/bsd-license.html +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# This file was created by Jacques Savoy and is distributed under the BSD license. +# Note: by default this file also contains forms normalized by HindiNormalizer +# for spelling variation (see section below), such that it can be used whether or +# not you enable that feature. When adding additional entries to this list, +# please add the normalized form as well. +अंदर +अत +अपना +अपनी +अपने +अभी +आदि +आप +इत्यादि +इन +इनका +इन्हीं +इन्हें +इन्हों +इस +इसका +इसकी +इसके +इसमें +इसी +इसे +उन +उनका +उनकी +उनके +उनको +उन्हीं +उन्हें +उन्हों +उस +उसके +उसी +उसे +एक +एवं +एस +ऐसे +और +कई +कर +करता +करते +करना +करने +करें +कहते +कहा +का +काफ़ी +कि +कितना +किन्हें +किन्हों +किया +किर +किस +किसी +किसे +की +कुछ +कुल +के +को +कोई +कौन +कौनसा +गया +घर +जब +जहाँ +जा +जितना +जिन +जिन्हें +जिन्हों +जिस +जिसे +जीधर +जैसा +जैसे +जो +तक +तब +तरह +तिन +तिन्हें +तिन्हों +तिस +तिसे +तो +था +थी +थे +दबारा +दिया +दुसरा +दूसरे +दो +द्वारा +न +नहीं +ना +निहायत +नीचे +ने +पर +पर +पहले +पूरा +पे +फिर +बनी +बही +बहुत +बाद +बाला +बिलकुल +भी +भीतर +मगर +मानो +मे +में +यदि +यह +यहाँ +यही +या +यिह +ये +रखें +रहा +रहे +ऱ्वासा +लिए +लिये +लेकिन +व +वर्ग +वह +वह +वहाँ +वहीं +वाले +वुह +वे +वग़ैरह +संग +सकता +सकते +सबसे +सभी +साथ +साबुत +साभ +सारा +से +सो +ही +हुआ +हुई +हुए +है +हैं +हो +होता +होती +होते +होना +होने +# additional normalized forms of the above +अपनि +जेसे +होति +सभि +तिंहों +इंहों +दवारा +इसि +किंहें +थि +उंहों +ओर +जिंहें +वहिं +अभि +बनि +हि +उंहिं +उंहें +हें +वगेरह +एसे +रवासा +कोन +निचे +काफि +उसि +पुरा +भितर +हे +बहि +वहां +कोइ +यहां +जिंहों +तिंहें +किसि +कइ +यहि +इंहिं +जिधर +इंहें +अदि +इतयादि +हुइ +कोनसा +इसकि +दुसरे +जहां +अप +किंहों +उनकि +भि +वरग +हुअ +जेसा +नहिं diff --git a/defaults/solr/lang/stopwords_hu.txt b/defaults/solr/lang/stopwords_hu.txt new file mode 100644 index 
000000000..1a96f1db6 --- /dev/null +++ b/defaults/solr/lang/stopwords_hu.txt @@ -0,0 +1,209 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + +| Hungarian stop word list +| prepared by Anna Tordai + +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. 
+ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +ő +ők +őket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/defaults/solr/lang/stopwords_hy.txt b/defaults/solr/lang/stopwords_hy.txt new file mode 100644 index 000000000..60c1c50fb --- /dev/null +++ b/defaults/solr/lang/stopwords_hy.txt @@ -0,0 +1,46 @@ +# example set of Armenian stopwords. 
+այդ +այլ +այն +այս +դու +դուք +եմ +են +ենք +ես +եք +է +էի +էին +էինք +էիր +էիք +էր +ըստ +թ +ի +ին +իսկ +իր +կամ +համար +հետ +հետո +մենք +մեջ +մի +ն +նա +նաև +նրա +նրանք +որ +որը +որոնք +որպես +ու +ում +պիտի +վրա +և diff --git a/defaults/solr/lang/stopwords_id.txt b/defaults/solr/lang/stopwords_id.txt new file mode 100644 index 000000000..4617f83a5 --- /dev/null +++ b/defaults/solr/lang/stopwords_id.txt @@ -0,0 +1,359 @@ +# from appendix D of: A Study of Stemming Effects on Information +# Retrieval in Bahasa Indonesia +ada +adanya +adalah +adapun +agak +agaknya +agar +akan +akankah +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +diantaranya +antara +antaranya +diantara +apa +apaan +mengapa +apabila +apakah +apalagi +apatah +atau +ataukah +ataupun +bagai +bagaikan +sebagai +sebagainya +bagaimana +bagaimanapun +sebagaimana +bagaimanakah +bagi +bahkan +bahwa +bahwasanya +sebaliknya +banyak +sebanyak +beberapa +seberapa +begini +beginian +beginikah +beginilah +sebegini +begitu +begitukah +begitulah +begitupun +sebegitu +belum +belumlah +sebelum +sebelumnya +sebenarnya +berapa +berapakah +berapalah +berapapun +betulkah +sebetulnya +biasa +biasanya +bila +bilakah +bisa +bisakah +sebisanya +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +cuma +percuma +dahulu +dalam +dan +dapat +dari +daripada +dekat +demi +demikian +demikianlah +sedemikian +dengan +depan +di +dia +dialah +dini +diri +dirinya +terdiri +dong +dulu +enggak +enggaknya +entah +entahlah +terhadap +terhadapnya +hal +hampir +hanya +hanyalah +harus +haruslah +harusnya +seharusnya +hendak +hendaklah +hendaknya +hingga +sehingga +ia +ialah +ibarat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jangan +jangankan +janganlah +jika +jikalau +juga +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +dikarenakan +karena +karenanya +ke +kecil +kemudian +kenapa +kepada +kepadanya +ketika +seketika +khususnya 
+kini +kinilah +kiranya +sekiranya +kita +kitalah +kok +lagi +lagian +selagi +lah +lain +lainnya +melainkan +selaku +lalu +melalui +terlalu +lama +lamanya +selama +selama +selamanya +lebih +terlebih +bermacam +macam +semacam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masih +masihkah +semasih +masing +mau +maupun +semaunya +memang +mereka +merekalah +meski +meskipun +semula +mungkin +mungkinkah +nah +namun +nanti +nantinya +nyaris +oleh +olehnya +seorang +seseorang +pada +padanya +padahal +paling +sepanjang +pantas +sepantasnya +sepantasnyalah +para +pasti +pastilah +per +pernah +pula +pun +merupakan +rupanya +serupa +saat +saatnya +sesaat +saja +sajalah +saling +bersama +sama +sesama +sambil +sampai +sana +sangat +sangatlah +saya +sayalah +se +sebab +sebabnya +sebuah +tersebut +tersebutlah +sedang +sedangkan +sedikit +sedikitnya +segala +segalanya +segera +sesegera +sejak +sejenak +sekali +sekalian +sekalipun +sesekali +sekaligus +sekarang +sekarang +sekitar +sekitarnya +sela +selain +selalu +seluruh +seluruhnya +semakin +sementara +sempat +semua +semuanya +sendiri +sendirinya +seolah +seperti +sepertinya +sering +seringnya +serta +siapa +siapakah +siapapun +disini +disinilah +sini +sinilah +sesuatu +sesuatunya +suatu +sesudah +sesudahnya +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tak +tanpa +setelah +telah +tentang +tentu +tentulah +tentunya +tertentu +seterusnya +tapi +tetapi +setiap +tiap +setidaknya +tidak +tidakkah +tidaklah +toh +waduh +wah +wahai +sewaktu +walau +walaupun +wong +yaitu +yakni +yang diff --git a/defaults/solr/lang/stopwords_it.txt b/defaults/solr/lang/stopwords_it.txt new file mode 100644 index 000000000..4cb5b0891 --- /dev/null +++ b/defaults/solr/lang/stopwords_it.txt @@ -0,0 +1,301 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | An Italian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +ad | a (to) before vowel +al | a + il +allo | a + lo +ai | a + i +agli | a + gli +all | a + l' +agl | a + gl' +alla | a + la +alle | a + le +con | with +col | con + il +coi | con + i (forms collo, cogli etc are now very rare) +da | from +dal | da + il +dallo | da + lo +dai | da + i +dagli | da + gli +dall | da + l' +dagl | da + gll' +dalla | da + la +dalle | da + le +di | of +del | di + il +dello | di + lo +dei | di + i +degli | di + gli +dell | di + l' +degl | di + gl' +della | di + la +delle | di + le +in | in +nel | in + el +nello | in + lo +nei | in + i +negli | in + gli +nell | in + l' +negl | in + gl' +nella | in + la +nelle | in + le +su | on +sul | su + il +sullo | su + lo +sui | su + i +sugli | su + gli +sull | su + l' +sugl | su + gl' +sulla | su + la +sulle | su + le +per | through, by +tra | among +contro | against +io | I +tu | thou +lui | he +lei | she +noi | we +voi | you +loro | they +mio | my +mia | +miei | +mie | +tuo | +tua | +tuoi | thy +tue | +suo | +sua | +suoi | his, her +sue | +nostro | our +nostra | +nostri | +nostre | +vostro | your +vostra | +vostri | +vostre | +mi | me +ti | thee +ci | us, there +vi | you, there +lo | him, the +la | her, the +li | them +le | them, the +gli | to him, the +ne | from there etc +il | the +un | a +uno | a +una | a +ma | but +ed | and +se | if +perché | why, because +anche | also +come | how +dov | where (as dov') +dove | where +che | who, that +chi | who +cui | whom +non | not +più | more +quale | who, that +quanto | how much +quanti | +quanta | +quante | +quello | that +quelli | +quella | +quelle | +questo | this +questi | +questa | +queste | +si | yes +tutto | all +tutti | all + + | single letter forms: + +a | at 
+c | as c' for ce or ci +e | and +i | the +l | as l' +o | or + + | forms of avere, to have (not including the infinitive): + +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute + + | forms of essere, to be (not including the infinitive): +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo + + | forms of fare, to do (not including the infinitive, fa, fat-): +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo + + | forms of stare, to be (not including the infinitive): +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/defaults/solr/lang/stopwords_ja.txt b/defaults/solr/lang/stopwords_ja.txt new file mode 100644 index 000000000..d4321be6b --- /dev/null +++ b/defaults/solr/lang/stopwords_ja.txt @@ -0,0 +1,127 @@ +# +# This file defines a stopword set for Japanese. +# +# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. +# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 +# for frequency lists, etc. 
that can be useful for making your own set (if desired) +# +# Note that there is an overlap between these stopwords and the terms stopped when used +# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note +# that comments are not allowed on the same line as stopwords. +# +# Also note that stopping is done in a case-insensitive manner. Change your StopFilter +# configuration if you need case-sensitive stopping. Lastly, note that stopping is done +# using the same character width as the entries in this file. Since this StopFilter is +# normally done after a CJKWidthFilter in your chain, you would usually want your romaji +# entries to be in half-width and your kana entries to be in full-width. +# +の +に +は +を +た +が +で +て +と +し +れ +さ +ある +いる +も +する +から +な +こと +として +い +や +れる +など +なっ +ない +この +ため +その +あっ +よう +また +もの +という +あり +まで +られ +なる +へ +か +だ +これ +によって +により +おり +より +による +ず +なり +られる +において +ば +なかっ +なく +しかし +について +せ +だっ +その後 +できる +それ +う +ので +なお +のみ +でき +き +つ +における +および +いう +さらに +でも +ら +たり +その他 +に関する +たち +ます +ん +なら +に対して +特に +せる +及び +これら +とき +では +にて +ほか +ながら +うち +そして +とともに +ただし +かつて +それぞれ +または +お +ほど +ものの +に対する +ほとんど +と共に +といった +です +とも +ところ +ここ +##### End of file diff --git a/defaults/solr/lang/stopwords_lv.txt b/defaults/solr/lang/stopwords_lv.txt new file mode 100644 index 000000000..e21a23c06 --- /dev/null +++ b/defaults/solr/lang/stopwords_lv.txt @@ -0,0 +1,172 @@ +# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins +# the original list of over 800 forms was refined: +# pronouns, adverbs, interjections were removed +# +# prepositions +aiz +ap +ar +apakš +ārpus +augšpus +bez +caur +dēļ +gar +iekš +iz +kopš +labad +lejpus +līdz +no +otrpus +pa +par +pār +pēc +pie +pirms +pret +priekš +starp +šaipus +uz +viņpus +virs +virspus +zem +apakšpus +# Conjunctions +un +bet +jo +ja +ka +lai +tomēr +tikko +turpretī +arī +kaut +gan +tādēļ +tā +ne +tikvien +vien +kā +ir +te +vai +kamēr +# Particles +ar +diezin +droši 
+diemžēl +nebūt +ik +it +taču +nu +pat +tiklab +iekšpus +nedz +tik +nevis +turpretim +jeb +iekam +iekām +iekāms +kolīdz +līdzko +tiklīdz +jebšu +tālab +tāpēc +nekā +itin +jā +jau +jel +nē +nezin +tad +tikai +vis +tak +iekams +vien +# modal verbs +būt +biju +biji +bija +bijām +bijāt +esmu +esi +esam +esat +būšu +būsi +būs +būsim +būsiet +tikt +tiku +tiki +tika +tikām +tikāt +tieku +tiec +tiek +tiekam +tiekat +tikšu +tiks +tiksim +tiksiet +tapt +tapi +tapāt +topat +tapšu +tapsi +taps +tapsim +tapsiet +kļūt +kļuvu +kļuvi +kļuva +kļuvām +kļuvāt +kļūstu +kļūsti +kļūst +kļūstam +kļūstat +kļūšu +kļūsi +kļūs +kļūsim +kļūsiet +# verbs +varēt +varēju +varējām +varēšu +varēsim +var +varēji +varējāt +varēsi +varēsiet +varat +varēja +varēs diff --git a/defaults/solr/lang/stopwords_nl.txt b/defaults/solr/lang/stopwords_nl.txt new file mode 100644 index 000000000..f4d61f509 --- /dev/null +++ b/defaults/solr/lang/stopwords_nl.txt @@ -0,0 +1,117 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. + +de | the +en | and +van | of, from +ik | I, the ego +te | (1) chez, at etc, (2) to, (3) too +dat | that, which +die | that, those, who, which +in | in, inside +een | a, an, one +hij | he +het | the, it +niet | not, nothing, naught +zijn | (1) to be, being, (2) his, one's, its +is | is +was | (1) was, past tense of all persons sing. 
of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river +op | on, upon, at, in, up, used up +aan | on, upon, to (as dative) +met | with, by +als | like, such as, when +voor | (1) before, in front of, (2) furrow +had | had, past tense all persons sing. of 'hebben' (have) +er | there +maar | but, only +om | round, about, for etc +hem | him +dan | then +zou | should/would, past tense all persons sing. of 'zullen' +of | or, whether, if +wat | what, something, anything +mijn | possessive and noun 'mine' +men | people, 'one' +dit | this +zo | so, thus, in this way +door | through by +over | over, across +ze | she, her, they, them +zich | oneself +bij | (1) a bee, (2) by, near, at +ook | also, too +tot | till, until +je | you +mij | me +uit | out of, from +der | Old Dutch form of 'van der' still found in surnames +daar | (1) there, (2) because +haar | (1) her, their, them, (2) hair +naar | (1) unpleasant, unwell etc, (2) towards, (3) as +heb | present first person sing. of 'to have' +hoe | how, why +heeft | present third person sing. of 'to have' +hebben | 'to have' and various parts thereof +deze | this +u | you +want | (1) for, (2) mitten, (3) rigging +nog | yet, still +zal | 'shall', first and third person sing. 
of verb 'zullen' (will) +me | me +zij | she, they +nu | now +ge | 'thou', still used in Belgium and south Netherlands +geen | none +omdat | because +iets | something, somewhat +worden | to become, grow, get +toch | yet, still +al | all, every, each +waren | (1) 'were' (2) to wander, (3) wares, (3) +veel | much, many +meer | (1) more, (2) lake +doen | to do, to make +toen | then, when +moet | noun 'spot/mote' and present form of 'to must' +ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' +zonder | without +kan | noun 'can' and present form of 'to be able' +hun | their, them +dus | so, consequently +alles | all, everything, anything +onder | under, beneath +ja | yes, of course +eens | once, one day +hier | here +wie | who +werd | imperfect third person sing. of 'become' +altijd | always +doch | yet, but etc +wordt | present third person sing. of 'become' +wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans +kunnen | to be able +ons | us/our +zelf | self +tegen | against, towards, at +na | after, near +reeds | already +wil | (1) present tense of 'want', (2) 'will', noun, (3) fender +kon | could; past tense of 'to be able' +niets | nothing +uw | your +iemand | somebody +geweest | been; past participle of 'be' +andere | other diff --git a/defaults/solr/lang/stopwords_no.txt b/defaults/solr/lang/stopwords_no.txt new file mode 100644 index 000000000..e76f36e69 --- /dev/null +++ b/defaults/solr/lang/stopwords_no.txt @@ -0,0 +1,192 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A Norwegian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. 
Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + +og | and +i | in +jeg | I +det | it/this/that +at | to (w. inf.) +en | a/an +et | a/an +den | it/this/that +til | to +er | is/am/are +som | who/that +på | on +de | they / you(formal) +med | with +han | he +av | of +ikke | not +ikkje | not * +der | there +så | so +var | was/were +meg | me +seg | you +men | but +ett | one +har | have +om | about +vi | we +min | my +mitt | my +ha | have +hadde | had +hun | she +nå | now +over | over +da | when/as +ved | by/know +fra | from +du | you +ut | out +sin | your +dem | them +oss | us +opp | up +man | you/one +kan | can +hans | his +hvor | where +eller | or +hva | what +skal | shall/must +selv | self (reflective) +sjøl | self (reflective) +her | here +alle | all +vil | will +bli | become +ble | became +blei | became * +blitt | have become +kunne | could +inn | in +når | when +være | be +kom | come +noen | some +noe | some +ville | would +dere | you +som | who/which/that +deres | their/theirs +kun | only/just +ja | yes +etter | after +ned | down +skulle | should +denne | this +for | for/because +deg | you +si | hers/his +sine | hers/his +sitt | hers/his +mot | against +å | to +meget | much +hvorfor | why +dette | this +disse | these/those +uten | without +hvordan | how +ingen | none +din | your +ditt | your +blir | become +samme | same +hvilken | which +hvilke | which (plural) +sånn | such a +inni | inside/within +mellom | between +vår | our +hver | each +hvem | who +vors | us/ours +hvis | whose +både | both +bare | only/just +enn | than +fordi | as/because +før | before +mange | many +også | also +slik | just +vært | been +være | to be +båe | both * +begge | both +siden | since +dykk | your * +dykkar | yours * +dei | they * +deira | them * +deires | theirs * +deim | them * +di | your (fem.) 
* +då | as/when * +eg | I * +ein | a/an * +eit | a/an * +eitt | a/an * +elles | or * +honom | he * +hjå | at * +ho | she * +hoe | she * +henne | her +hennar | her/hers +hennes | hers +hoss | how * +hossen | how * +ikkje | not * +ingi | noone * +inkje | noone * +korleis | how * +korso | how * +kva | what/which * +kvar | where * +kvarhelst | where * +kven | who/whom * +kvi | why * +kvifor | why * +me | we * +medan | while * +mi | my * +mine | my * +mykje | much * +no | now * +nokon | some (masc./neut.) * +noka | some (fem.) * +nokor | some * +noko | some * +nokre | some * +si | his/hers * +sia | since * +sidan | since * +so | so * +somt | some * +somme | some * +um | about* +upp | up * +vere | be * +vore | was * +verte | become * +vort | become * +varte | became * +vart | became * + diff --git a/defaults/solr/lang/stopwords_pt.txt b/defaults/solr/lang/stopwords_pt.txt new file mode 100644 index 000000000..276c1b446 --- /dev/null +++ b/defaults/solr/lang/stopwords_pt.txt @@ -0,0 +1,251 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A Portuguese stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | of, from +a | the; to, at; her +o | the; him +que | who, that +e | and +do | de + o +da | de + a +em | in +um | a +para | for + | é from SER +com | with +não | not, no +uma | a +os | the; them +no | em + o +se | himself etc +na | em + a +por | for +mais | more +as | the; them +dos | de + os +como | as, like +mas | but + | foi from SER +ao | a + o +ele | he +das | de + as + | tem from TER +à | a + a +seu | his +sua | her +ou | or + | ser from SER +quando | when +muito | much + | há from HAV +nos | em + os; us +já | already, now + | está from EST +eu | I +também | also +só | only, just +pelo | per + o +pela | per + a +até | up to +isso | that +ela | she +entre | between + | era from SER +depois | after +sem | without +mesmo | same +aos | a + os + | ter from TER +seus | his +quem | whom +nas | em + as +me | me +esse | that +eles | they + | estão from EST +você | you + | tinha from TER + | foram from SER +essa | that +num | em + um +nem | nor +suas | her +meu | my +às | a + as +minha | my + | têm from TER +numa | em + uma +pelos | per + os +elas | they + | havia from HAV + | seja from SER +qual | which + | será from SER +nós | we + | tenho from TER +lhe | to him, her +deles | of them +essas | those +esses | those +pelas | per + as +este | this + | fosse from SER +dele | of him + + | other words. There are many contractions such as naquele = em+aquele, + | mo = me+o, but they are rare. + | Indefinite article plural forms are also rare. 
+ +tu | thou +te | thee +vocês | you (plural) +vos | you +lhes | to them +meus | my +minhas +teu | thy +tua +teus +tuas +nosso | our +nossa +nossos +nossas + +dela | of her +delas | of them + +esta | this +estes | these +estas | these +aquele | that +aquela | that +aqueles | those +aquelas | those +isto | this +aquilo | that + + | forms of estar, to be (not including the infinitive): +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem + + | forms of haver, to have (not including the infinitive): +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam + + | forms of ser, to be (not including the infinitive): +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam + + | forms of ter, to have (not including the infinitive): +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/defaults/solr/lang/stopwords_ro.txt b/defaults/solr/lang/stopwords_ro.txt new file mode 100644 index 000000000..4fdee90a5 --- /dev/null +++ b/defaults/solr/lang/stopwords_ro.txt @@ -0,0 +1,233 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +acea +aceasta +această +aceea +acei +aceia +acel +acela +acele +acelea +acest +acesta +aceste +acestea +aceşti +aceştia +acolo +acum +ai +aia +aibă +aici +al +ăla +ale +alea +ălea +altceva +altcineva +am +ar +are +aş +aşadar +asemenea +asta +ăsta +astăzi +astea +ăstea +ăştia +asupra +aţi +au +avea +avem +aveţi +azi +bine +bucur +bună +ca +că +căci +când +care +cărei +căror +cărui +cât +câte +câţi +către +câtva +ce +cel +ceva +chiar +cînd +cine +cineva +cît +cîte +cîţi +cîtva +contra +cu +cum +cumva +curând +curînd +da +dă +dacă +dar +datorită +de +deci +deja +deoarece +departe +deşi +din +dinaintea +dintr +dintre +drept +după +ea +ei +el +ele +eram +este +eşti +eu +face +fără +fi +fie +fiecare +fii +fim +fiţi +iar +ieri +îi +îl +îmi +împotriva +în +înainte +înaintea +încât +încît +încotro +între +întrucât +întrucît +îţi +la +lângă +le +li +lîngă +lor +lui +mă +mâine +mea +mei +mele +mereu +meu +mi +mine +mult +multă +mulţi +ne +nicăieri +nici +nimeni +nişte +noastră +noastre +noi +noştri +nostru +nu +ori +oricând +oricare +oricât +orice +oricînd +oricine +oricît +oricum +oriunde +până +pe +pentru +peste +pînă +poate +pot +prea +prima +primul +prin +printr +sa +să +săi +sale +sau +său +se +şi +sînt +sîntem +sînteţi +spre +sub +sunt +suntem +sunteţi +ta +tăi +tale +tău +te +ţi +ţie +tine +toată +toate +tot +toţi +totuşi +tu +un +una +unde +undeva +unei +unele +uneori +unor +vă +vi +voastră +voastre +voi +voştri +vostru +vouă +vreo +vreun diff --git a/defaults/solr/lang/stopwords_ru.txt b/defaults/solr/lang/stopwords_ru.txt new file mode 100644 index 000000000..643076934 --- /dev/null +++ b/defaults/solr/lang/stopwords_ru.txt @@ -0,0 +1,241 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. + + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | letter `ё' is translated to `е'. + +и | and +в | in/into +во | alternative form +не | not +что | what/that +он | he +на | on/onto +я | i +с | from +со | alternative form +как | how +а | milder form of `no' (but) +то | conjunction and form of `that' +все | all +она | she +так | so, thus +его | him +но | but +да | yes/and +ты | thou +к | towards, by +у | around, chez +же | intensifier particle +вы | you +за | beyond, behind +бы | conditional/subj. particle +по | up to, along +только | only +ее | her +мне | to me +было | it was +вот | here is/are, particle +от | away from +меня | me +еще | still, yet, more +нет | no, there isnt/arent +о | about +из | out of +ему | to him +теперь | now +когда | when +даже | even +ну | so, well +вдруг | suddenly +ли | interrogative particle +если | if +уже | already, but homonym of `narrower' +или | or +ни | neither +быть | to be +был | he was +него | prepositional form of его +до | up to +вас | you accusative +нибудь | indef. 
suffix preceded by hyphen +опять | again +уж | already, but homonym of `adder' +вам | to you +сказал | he said +ведь | particle `after all' +там | there +потом | then +себя | oneself +ничего | nothing +ей | to her +может | usually with `быть' as `maybe' +они | they +тут | here +где | where +есть | there is/are +надо | got to, must +ней | prepositional form of ей +для | for +мы | we +тебя | thee +их | them, their +чем | than +была | she was +сам | self +чтоб | in order to +без | without +будто | as if +человек | man, person, one +чего | genitive form of `what' +раз | once +тоже | also +себе | to oneself +под | beneath +жизнь | life +будет | will be +ж | short form of intensifier particle `же' +тогда | then +кто | who +этот | this +говорил | was saying +того | genitive form of `that' +потому | for that reason +этого | genitive form of `this' +какой | which +совсем | altogether +ним | prepositional form of `его', `они' +здесь | here +этом | prepositional form of `этот' +один | one +почти | almost +мой | my +тем | instrumental/dative plural of `тот', `то' +чтобы | full form of `in order that' +нее | her (acc.) +кажется | it seems +сейчас | now +были | they were +куда | where to +зачем | why +сказать | to say +всех | all (acc., gen. preposn. plural) +никогда | never +сегодня | today +можно | possible, one can +при | by +наконец | finally +два | two +об | alternative form of `о', about +другой | another +хоть | even +после | after +над | above +больше | more +тот | that one (masc.) +через | across, in +эти | these +нас | us +про | about +всего | in all, only, of all +них | prepositional form of `они' (they) +какая | which, feminine +много | lots +разве | interrogative particle +сказала | she said +три | three +эту | this, acc. fem. sing. +моя | my, feminine +впрочем | moreover, besides +хорошо | good +свою | ones own, acc. fem. sing. +этой | oblique form of `эта', fem. `this' +перед | in front of +иногда | sometimes +лучше | better +чуть | a little +том | preposn. 
form of `that one' +нельзя | one must not +такой | such a one +им | to them +более | more +всегда | always +конечно | of course +всю | acc. fem. sing of `all' +между | between + + + | b: some paradigms + | + | personal pronouns + | + | я меня мне мной [мною] + | ты тебя тебе тобой [тобою] + | он его ему им [него, нему, ним] + | она ее ей ею [нее, ней, нею] + | оно его ему им [него, нему, ним] + | + | мы нас нам нами + | вы вас вам вами + | они их им ими [них, ним, ними] + | + | себя себе собой [собою] + | + | demonstrative pronouns: этот (this), тот (that) + | + | этот эта это эти + | этого эту это эти + | этого этой этого этих + | этому этой этому этим + | этим этой этим [этою] этими + | этом этой этом этих + | + | тот та то те + | того ту то те + | того той того тех + | тому той тому тем + | тем той тем [тою] теми + | том той том тех + | + | determinative pronouns + | + | (a) весь (all) + | + | весь вся все все + | всего всю все все + | всего всей всего всех + | всему всей всему всем + | всем всей всем [всею] всеми + | всем всей всем всех + | + | (b) сам (himself etc) + | + | сам сама само сами + | самого саму само самих + | самого самой самого самих + | самому самой самому самим + | самим самой самим [самою] самими + | самом самой самом самих + | + | stems of verbs `to be', `to have', `to do' and modal + | + | быть бы буд быв есть суть + | име + | дел + | мог мож мочь + | уме + | хоч хот + | долж + | можн + | нужн + | нельзя + diff --git a/defaults/solr/lang/stopwords_sv.txt b/defaults/solr/lang/stopwords_sv.txt new file mode 100644 index 000000000..22bddfd8c --- /dev/null +++ b/defaults/solr/lang/stopwords_sv.txt @@ -0,0 +1,131 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ + | A Swedish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + +och | and +det | it, this/that +att | to (with infinitive) +i | in, at +en | a +jag | I +hon | she +som | who, that +han | he +på | on +den | it, this/that +med | with +var | where, each +sig | him(self) etc +för | for +så | so (also: seed) +till | to +är | is +men | but +ett | a +om | if; around, about +hade | had +de | they, these/those +av | of +icke | not, no +mig | me +du | you +henne | her +då | then, when +sin | his +nu | now +har | have +inte | inte någon = no one +hans | his +honom | him +skulle | 'sake' +hennes | her +där | there +min | my +man | one (pronoun) +ej | nor +vid | at, by, on (also: vast) +kunde | could +något | some etc +från | from, off +ut | out +när | when +efter | after, behind +upp | up +vi | we +dem | them +vara | be +vad | what +över | over +än | than +dig | you +kan | can +sina | his +här | here +ha | have +mot | towards +alla | all +under | under (also: wonder) +någon | some etc +eller | or (else) +allt | all +mycket | much +sedan | since +ju | why +denna | this/that +själv | myself, yourself etc +detta | this/that +åt | to +utan | without +varit | was +hur | how +ingen | no +mitt | my +ni | you +bli | to be, become +blev | from bli +oss | us +din | thy +dessa | these/those +några | some etc +deras | their +blir | from bli +mina | my +samma | (the) same +vilken | who, that +er | you, your +sådan | such a +vår | our +blivit | from bli +dess | its +inom | within +mellan | between +sådant | such a +varför | why +varje | each +vilka | who, that +ditt | thy +vem | who +vilket | who, that +sitta | his +sådana | such a +vart | each +dina | thy +vars | whose +vårt | our +våra | our +ert | your 
+era | your +vilkas | whose + diff --git a/defaults/solr/lang/stopwords_th.txt b/defaults/solr/lang/stopwords_th.txt new file mode 100644 index 000000000..07f0fabe6 --- /dev/null +++ b/defaults/solr/lang/stopwords_th.txt @@ -0,0 +1,119 @@ +# Thai stopwords from: +# "Opinion Detection in Thai Political News Columns +# Based on Subjectivity Analysis" +# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak +ไว้ +ไม่ +ไป +ได้ +ให้ +ใน +โดย +แห่ง +แล้ว +และ +แรก +แบบ +แต่ +เอง +เห็น +เลย +เริ่ม +เรา +เมื่อ +เพื่อ +เพราะ +เป็นการ +เป็น +เปิดเผย +เปิด +เนื่องจาก +เดียวกัน +เดียว +เช่น +เฉพาะ +เคย +เข้า +เขา +อีก +อาจ +อะไร +ออก +อย่าง +อยู่ +อยาก +หาก +หลาย +หลังจาก +หลัง +หรือ +หนึ่ง +ส่วน +ส่ง +สุด +สําหรับ +ว่า +วัน +ลง +ร่วม +ราย +รับ +ระหว่าง +รวม +ยัง +มี +มาก +มา +พร้อม +พบ +ผ่าน +ผล +บาง +น่า +นี้ +นํา +นั้น +นัก +นอกจาก +ทุก +ที่สุด +ที่ +ทําให้ +ทํา +ทาง +ทั้งนี้ +ทั้ง +ถ้า +ถูก +ถึง +ต้อง +ต่างๆ +ต่าง +ต่อ +ตาม +ตั้งแต่ +ตั้ง +ด้าน +ด้วย +ดัง +ซึ่ง +ช่วง +จึง +จาก +จัด +จะ +คือ +ความ +ครั้ง +คง +ขึ้น +ของ +ขอ +ขณะ +ก่อน +ก็ +การ +กับ +กัน +กว่า +กล่าว diff --git a/defaults/solr/lang/stopwords_tr.txt b/defaults/solr/lang/stopwords_tr.txt new file mode 100644 index 000000000..84d9408d4 --- /dev/null +++ b/defaults/solr/lang/stopwords_tr.txt @@ -0,0 +1,212 @@ +# Turkish stopwords from LUCENE-559 +# merged with the list from "Information Retrieval on Turkish Texts" +# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) +acaba +altmış +altı +ama +ancak +arada +aslında +ayrıca +bana +bazı +belki +ben +benden +beni +benim +beri +beş +bile +bin +bir +birçok +biri +birkaç +birkez +birşey +birşeyi +biz +bize +bizden +bizi +bizim +böyle +böylece +bu +buna +bunda +bundan +bunlar +bunları +bunların +bunu +bunun +burada +çok +çünkü +da +daha +dahi +de +defa +değil +diğer +diye +doksan +dokuz +dolayı +dolayısıyla +dört +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +eğer +elli +en +etmesi +etti +ettiği +ettiğini +gibi +göre +halen +hangi 
+hatta +hem +henüz +hep +hepsi +her +herhangi +herkesin +hiç +hiçbir +için +iki +ile +ilgili +ise +işte +itibaren +itibariyle +kadar +karşın +katrilyon +kendi +kendilerine +kendini +kendisi +kendisine +kendisini +kez +ki +kim +kimden +kime +kimi +kimse +kırk +milyar +milyon +mu +mü +mı +nasıl +ne +neden +nedenle +nerde +nerede +nereye +niye +niçin +o +olan +olarak +oldu +olduğu +olduğunu +olduklarını +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor +on +ona +ondan +onlar +onlardan +onları +onların +onu +onun +otuz +oysa +öyle +pek +rağmen +sadece +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +sizi +sizin +şey +şeyden +şeyi +şeyler +şöyle +şu +şuna +şunda +şundan +şunları +şunu +tarafından +trilyon +tüm +üç +üzere +var +vardı +ve +veya +ya +yani +yapacak +yapılan +yapılması +yapıyor +yapmak +yaptı +yaptığı +yaptığını +yaptıkları +yedi +yerine +yetmiş +yine +yirmi +yoksa +yüz +zaten diff --git a/defaults/solr/protwords.txt b/defaults/solr/protwords.txt new file mode 100644 index 000000000..1dfc0abec --- /dev/null +++ b/defaults/solr/protwords.txt @@ -0,0 +1,21 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +# Use a protected word file to protect against the stemmer reducing two +# unrelated words to the same base word. 
+ +# Some non-words that normally won't be encountered, +# just to test that they won't be stemmed. +dontstems +zwhacky + diff --git a/defaults/solr/schema.xml b/defaults/solr/schema.xml new file mode 100755 index 000000000..52b36042a --- /dev/null +++ b/defaults/solr/schema.xml @@ -0,0 +1,1012 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/defaults/solr/solr.xml b/defaults/solr/solr.xml new file mode 100644 index 000000000..6d4d9f0e1 --- /dev/null +++ b/defaults/solr/solr.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/defaults/solr/solrconfig.xml b/defaults/solr/solrconfig.xml new file mode 100755 index 000000000..5f50ab580 --- /dev/null +++ 
b/defaults/solr/solrconfig.xml @@ -0,0 +1,1622 @@ + + + + + + + + + ${solr.abortOnConfigurationError:true} + + + LUCENE_36 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.data.dir:} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1024 + + + + + + + + + + + + + + + + + + + + + + true + + + + + + 20 + + + 200 + + + + + + + + + + + + static firstSearcher warming in solrconfig.xml + + + + + + false + + + 2 + + + + + + + + + + + + + + + + + + + + + + + explicit + 10 + text + + + + + + + + + + + + + + explicit + + + velocity + + browse + layout + Solritas + + text + edismax + *:* + 10 + *,score + + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + + text,features,name,sku,id,manu,cat + 3 + + + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + + + on + cat + manu_exact + ipod + GB + 1 + cat,inStock + after + price + 0 + 600 + 50 + popularity + 0 + 10 + 3 + manufacturedate_dt + NOW/YEAR-10YEARS + NOW + +1YEAR + before + after + + + + on + text features name + 0 + name + + + spellcheck + + + + + + + + + + + + + + + + + + + + + + + text + true + ignored_ + + + true + links + ignored_ + + + + + + + + + + + + + + + + + + + + + + + + solrpingquery + + + all + + + + + + + explicit + true + + + + + + + + + + + + textSpell + + + + + + default + name + spellchecker + + + + + + + + + + + + + + + + + + text + false + false + 1 + + + spellcheck + + + + + + + + + + text + true + + + tvComponent + + + + + + + + + default + + + org.carrot2.clustering.lingo.LingoClusteringAlgorithm + + + 20 + + + clustering/carrot2 + + + ENGLISH + + + stc + org.carrot2.clustering.stc.STCClusteringAlgorithm + + + + + + + true + default + true + + name + id + + features + + true + + + + false + + text + edismax + + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + + *:* + 10 + *,score + + + clustering + + + + + + + + + + true 
+ + + terms + + + + + + + + string + elevate.xml + + + + + + explicit + text + + + elevator + + + + + + + + + + + 100 + + + + + + + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} + + + + + + + ]]> + ]]> + + + + + + + + + + + + + + + + + + + + + ,, + ,, + ,, + ,, + ,]]> + ]]> + + + + + + 10 + .,!? + + + + + + + WORD + + en + US + + + + + + + + + + + + + + + + + + + text/plain; charset=UTF-8 + + + + + + + + + 5 + + + + + + + + + + + + + *:* + + + + + + diff --git a/defaults/solr/stopwords.txt b/defaults/solr/stopwords.txt new file mode 100644 index 000000000..ae1e83eeb --- /dev/null +++ b/defaults/solr/stopwords.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/defaults/solr/synonyms.txt b/defaults/solr/synonyms.txt new file mode 100644 index 000000000..7f7212830 --- /dev/null +++ b/defaults/solr/synonyms.txt @@ -0,0 +1,29 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaafoo => aaabar +bbbfoo => bbbfoo bbbbar +cccfoo => cccbar cccbaz +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming +#after us won't split it into two words. + +# Synonym mappings can be used for spelling correction too +pixima => pixma + diff --git a/defaults/yacy.init b/defaults/yacy.init index b09bed54f..e9d67aaf3 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -1047,12 +1047,18 @@ color_searchurlhover = #008000 # - extract the solr (3.1) package, 'cd example' and start solr with 'java -jar start.jar' # - start yacy and then start a crawler. The crawler will fill both, YaCy and solr indexes. 
# - to check whats in solr after indexing, open http://localhost:8983/solr/admin/ -federated.service.yacy.indexing.enabled = true federated.service.solr.indexing.enabled = false federated.service.solr.indexing.url = http://127.0.0.1:8983/solr +federated.service.solr.indexing.commitWithinMs = 180000 federated.service.solr.indexing.sharding = MODULO_HOST_MD5 federated.service.solr.indexing.schemefile = solr.keys.default.list +# the indexing engine in YaCy can be switched off or on +# (off may make sense if federated.service.solr.indexing.enabled = true) +# for experiments the value federated.service.yacy.indexing.engine = solr may be used +# allowed values are: classic, solr, off +federated.service.yacy.indexing.engine = classic + # RDF triplestore settings triplestore.persistent = true diff --git a/htroot/Blog.java b/htroot/Blog.java index a15c93350..bb8777dee 100644 --- a/htroot/Blog.java +++ b/htroot/Blog.java @@ -30,8 +30,8 @@ // javac -classpath .:../classes Blog.java // if the shell's current path is HTROOT -import java.util.Date; import java.text.DateFormat; +import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -39,6 +39,7 @@ import java.util.Locale; import java.util.Map; import net.yacy.cora.document.UTF8; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.peers.NewsPool; @@ -53,12 +54,12 @@ public class Blog { private static final String DEFAULT_PAGE = "blog_default"; private static DateFormat SimpleFormatter = DateFormat.getDateTimeInstance(DateFormat.DEFAULT,DateFormat.DEFAULT, Locale.getDefault()); - + /** * print localized date/time "yyyy/mm/dd HH:mm:ss" * @param date - * @return - */ + * @return + */ public static String dateString(final Date date) { return SimpleFormatter.format(date); } @@ -100,7 +101,7 @@ public class Blog { } String pagename = post.get("page", DEFAULT_PAGE); - final String ip = 
header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "127.0.0.1"); + final String ip = header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST); String strAuthor = post.get("author", "anonymous"); diff --git a/htroot/BlogComments.java b/htroot/BlogComments.java index 048e066e5..2a54b49af 100644 --- a/htroot/BlogComments.java +++ b/htroot/BlogComments.java @@ -36,12 +36,15 @@ import java.util.Date; import java.util.Iterator; import net.yacy.cora.document.UTF8; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.FileUtils; import net.yacy.peers.Network; import net.yacy.search.Switchboard; + +import com.google.common.io.Files; + import de.anomic.data.BlogBoard; import de.anomic.data.BlogBoard.BlogEntry; import de.anomic.data.BlogBoardComments; @@ -53,7 +56,7 @@ import de.anomic.server.serverSwitch; public class BlogComments { private static final String DEFAULT_PAGE = "blog_default"; - + public static String dateString(final Date date) { return Blog.dateString(date); } @@ -81,7 +84,7 @@ public class BlogComments { } String pagename = post.get("page", DEFAULT_PAGE); - final String ip = post.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "127.0.0.1"); + final String ip = post.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST); String strAuthor = post.get("author", "anonymous"); @@ -142,7 +145,7 @@ public class BlogComments { final File notifierSource = new File(sb.getAppPath(), sb.getConfig("htRootPath","htroot") + "/env/grafics/message.gif"); final File notifierDest = new File(sb.getDataPath("htDocsPath", "DATA/HTDOCS"), "notifier.gif"); try { - FileUtils.copy(notifierSource, notifierDest); + Files.copy(notifierSource, notifierDest); } catch (final IOException e) { Log.logSevere("MESSAGE", "NEW MESSAGE ARRIVED! 
(error: " + e.getMessage() + ")"); diff --git a/htroot/CacheResource_p.java b/htroot/CacheResource_p.java index 091480c64..490bd2d4a 100644 --- a/htroot/CacheResource_p.java +++ b/htroot/CacheResource_p.java @@ -1,4 +1,4 @@ -// CacheResource_p.java +// CacheResource_p.java // ----------------------- // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de @@ -30,7 +30,6 @@ import net.yacy.cora.protocol.ResponseHeader; import net.yacy.document.ImageParser; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; - import de.anomic.crawler.Cache; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -41,9 +40,9 @@ public class CacheResource_p { public static Object respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final servletProperties prop = new servletProperties(); prop.put("resource", new byte[0]); - + if (post == null) return prop; - + final String u = post.get("url", ""); DigestURI url; try { @@ -52,10 +51,10 @@ public class CacheResource_p { Log.logException(e); return prop; } - + byte[] resource = Cache.getContent(url.hash()); if (resource == null) return prop; - + // check request type if (header.get("EXT", "html").equals("png")) { // a png was requested @@ -65,11 +64,11 @@ public class CacheResource_p { ResponseHeader responseHeader = Cache.getResponseHeader(url.hash()); String resMime = responseHeader == null ? 
null : responseHeader.mime(); if (resMime != null) { - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); outgoingHeader.put(HeaderFramework.CONTENT_TYPE, resMime); prop.setOutgoingHeader(outgoingHeader); - } - + } + // add resource prop.put("resource", resource); return prop; diff --git a/htroot/ConfigAccounts_p.java b/htroot/ConfigAccounts_p.java index 3afa0611d..d9ab259f9 100644 --- a/htroot/ConfigAccounts_p.java +++ b/htroot/ConfigAccounts_p.java @@ -1,4 +1,4 @@ -//Config_Accounts_p.java +//Config_Accounts_p.java //----------------------- //part of the AnomicHTTPD caching proxy //(C) by Michael Peter Christen; mc@yacy.net @@ -29,25 +29,25 @@ //javac -classpath .:../Classes Message.java //if the shell's current path is HTROOT +import java.util.EnumMap; import java.util.HashMap; import java.util.Iterator; +import java.util.Map; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Digest; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; - import de.anomic.data.UserDB; import de.anomic.data.UserDB.AccessRight; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -import java.util.EnumMap; -import java.util.Map; public class ConfigAccounts_p { - + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final serverObjects prop = new serverObjects(); @@ -57,7 +57,7 @@ public class ConfigAccounts_p { // admin password boolean localhostAccess = sb.getConfigBool("adminAccountForLocalhost", false); if (post != null && post.containsKey("setAdmin")) { - localhostAccess = "localhost".equals(post.get("access", "")); + localhostAccess = Domains.isLocalhost(post.get("access", "")); final String user = (post == null) ? 
"" : post.get("adminuser", ""); final String pw1 = (post == null) ? "" : post.get("adminpw1", ""); final String pw2 = (post == null) ? "" : post.get("adminpw2", ""); @@ -68,7 +68,7 @@ public class ConfigAccounts_p { env.setConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, Digest.encodeMD5Hex(Base64Order.standardCoder.encodeString(user + ":" + pw1))); env.setConfig("adminAccount", ""); } - + if (localhostAccess) { sb.setConfig("adminAccountForLocalhost", true); @@ -87,16 +87,16 @@ public class ConfigAccounts_p { } } } - + if (env.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, "").length() == 0 && !env.getConfigBool("adminAccountForLocalhost", false)) { prop.put("passwordNotSetWarning", 1); } - + prop.put("localhost.checked", (localhostAccess) ? 1 : 0); prop.put("account.checked", (localhostAccess) ? 0 : 1); prop.put("statusPassword", localhostAccess ? "0" : "1"); prop.put("defaultUser", "admin"); - + //default values prop.put("current_user", "newuser"); prop.put("username", ""); @@ -115,13 +115,13 @@ public class ConfigAccounts_p { c++; } prop.put("rights", c); - + prop.put("users", "0"); - + if (sb.userDB == null) { return prop; } - + if (post == null) { //do nothing @@ -130,7 +130,7 @@ public class ConfigAccounts_p { //current_user = edited user } else if (post.containsKey("user") && !"newuser".equals(post.get("user"))){ if (post.containsKey("change_user")) { - //defaults for newuser are set above + //defaults for newuser are set above entry = sb.userDB.getEntry(post.get("user")); // program crashes if a submit with empty username was made on previous mask and the user clicked on the // link: "If you want to manage more Users, return to the user page." (parameter "user" is empty) @@ -176,10 +176,10 @@ public class ConfigAccounts_p { for(final AccessRight right : rights) { rightsSet.put(right, post.containsKey(right.toString()) && "on".equals(post.get(right.toString())) ? 
"true" : "false"); } - + final Map mem = new HashMap(); if( "newuser".equals(post.get("current_user"))){ //new user - + if (!"".equals(pw1)) { //change only if set mem.put(UserDB.Entry.MD5ENCODED_USERPWD_STRING, Digest.encodeMD5Hex(username + ":" + pw1)); } @@ -202,7 +202,7 @@ public class ConfigAccounts_p { } catch (final IllegalArgumentException e) { prop.put("error", "3"); } - + } else { //edit user entry = sb.userDB.getEntry(username); @@ -235,7 +235,7 @@ public class ConfigAccounts_p { }//edit user prop.putHTML("username", username); } - + //Generate Userlist final Iterator it = sb.userDB.iterator(true); int numUsers=0; diff --git a/htroot/ConfigAppearance_p.java b/htroot/ConfigAppearance_p.java index 1bddfa01b..331c11c5f 100644 --- a/htroot/ConfigAppearance_p.java +++ b/htroot/ConfigAppearance_p.java @@ -1,4 +1,4 @@ -// ConfigAppearance_p.java +// ConfigAppearance_p.java // ----------------------- // part of YaCy // (C) by Michael Peter Christen; mc@yacy.net @@ -14,7 +14,7 @@ //$LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -34,6 +34,7 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; +import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -44,9 +45,10 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; +import com.google.common.io.Files; + import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -import java.util.Collections; public class ConfigAppearance_p { @@ -77,7 +79,7 @@ public class ConfigAppearance_p { if (skinFiles.contains(selectedSkin)) { changeSkin(sb, skinPath, selectedSkin); } - + } if (post.containsKey("delete_button")) { @@ -91,7 +93,7 @@ 
public class ConfigAppearance_p { FileUtils.deletedelete(skinfile); } } - + if (post.containsKey("install_button")) { // load skin from URL final String url = post.get("url"); @@ -112,7 +114,7 @@ public class ConfigAppearance_p { while (it.hasNext()) { bw.write(it.next() + "\n"); } - + bw.close(); } catch (final IOException e) { prop.put("status", "2");// error saving the skin @@ -148,7 +150,7 @@ public class ConfigAppearance_p { } prop.put("skinlist", count); prop.putHTML("currentskin", env.getConfig("currentSkin", "default")); - + // write colors from generic skin Iterator i = env.configKeys(); while (i.hasNext()) { @@ -165,7 +167,7 @@ public class ConfigAppearance_p { styleFile.getParentFile().mkdirs(); try { - FileUtils.copy(skinFile, styleFile); + Files.copy(skinFile, styleFile); sb.setConfig("currentSkin", skin.substring(0, skin.length() - 4)); return true; } catch (final IOException e) { diff --git a/htroot/CookieTest_p.java b/htroot/CookieTest_p.java index db478fdf0..84278482a 100644 --- a/htroot/CookieTest_p.java +++ b/htroot/CookieTest_p.java @@ -31,7 +31,6 @@ import java.util.Iterator; import java.util.Map; import net.yacy.cora.protocol.ResponseHeader; - import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.servletProperties; @@ -39,7 +38,7 @@ import de.anomic.server.servletProperties; public class CookieTest_p { public static serverObjects respond(final ResponseHeader header, final serverObjects post, final serverSwitch env) { - + // case if no values are requested if (post == null || env == null) { @@ -48,10 +47,10 @@ public class CookieTest_p { final serverObjects prop = new serverObjects(); return prop; } - + final servletProperties prop = new servletProperties(); if (post.containsKey("act") && "clear_cookie".equals(post.get("act"))) { - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); final Iterator> it = 
header.entrySet().iterator(); Map.Entry e; while (it.hasNext()) { @@ -65,15 +64,15 @@ public class CookieTest_p { } } } - + prop.setOutgoingHeader(outgoingHeader); prop.put("coockiesout", "0"); //header. - + } else if (post.containsKey("act") && "set_cookie".equals(post.get("act"))) { final String cookieName = post.get("cookie_name").trim(); final String cookieValue = post.get("cookie_value").trim(); - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); outgoingHeader.setCookie(cookieName,cookieValue); prop.setOutgoingHeader(outgoingHeader); diff --git a/htroot/CrawlStartScanner_p.java b/htroot/CrawlStartScanner_p.java index fe542f696..f2f96feeb 100644 --- a/htroot/CrawlStartScanner_p.java +++ b/htroot/CrawlStartScanner_p.java @@ -207,7 +207,7 @@ public class CrawlStartScanner_p "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99&directDocByURL=off"; path += "&crawlingURL=" + url.toNormalform(true, false); WorkTables.execAPICall( - "localhost", + Domains.LOCALHOST, (int) sb.getConfigLong("port", 8090), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), path, @@ -254,7 +254,7 @@ public class CrawlStartScanner_p "/Crawler_p.html?createBookmark=off&xsstopw=off&crawlingDomMaxPages=10000&intention=&range=domain&indexMedia=on&recrawl=nodoubles&xdstopw=off&storeHTCache=on&sitemapURL=&repeat_time=7&crawlingQ=on&cachePolicy=iffresh&indexText=on&crawlingMode=url&mustnotmatch=&crawlingDomFilterDepth=1&crawlingDomFilterCheck=off&crawlingstart=Start%20New%20Crawl&xpstopw=off&repeat_unit=seldays&crawlingDepth=99"; path += "&crawlingURL=" + urlString; 
WorkTables.execAPICall( - "localhost", + Domains.LOCALHOST, (int) sb.getConfigLong("port", 8090), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), path, diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index b0a0c983a..74c89f991 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -455,7 +455,7 @@ public class Crawler_p { prop.put("info", "6"); // Error with url prop.putHTML("info_crawlingStart", crawlingStart); prop.putHTML("info_error", e.getMessage()); - Log.logException(e); + Log.logInfo("Crawler_p", "start url rejected: " + e.getMessage()); } } else if ("file".equals(crawlingMode)) { diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 4a8733f12..aa1997815 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -92,7 +92,7 @@ public class IndexControlRWIs_p prop.put("keyhash", ""); prop.put("result", ""); prop.put("cleanup", post == null || post.containsKey("maxReferencesLimit") ? 1 : 0); - prop.put("cleanup_solr", sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null + prop.put("cleanup_solr", sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getRemoteSolr() == null || !sb.getConfigBool("federated.service.solr.indexing.enabled", false) ? 0 : 1); String segmentName = sb.getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); @@ -180,7 +180,7 @@ public class IndexControlRWIs_p if ( post.get("deleteSolr", "").equals("on") && sb.getConfigBool("federated.service.solr.indexing.enabled", false) ) { try { - sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().clear(); + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getRemoteSolr().clear(); } catch ( final Exception e ) { Log.logException(e); } diff --git a/htroot/IndexFederated_p.html b/htroot/IndexFederated_p.html index 8d2ebdb72..4ae4a2e7b 100644 --- a/htroot/IndexFederated_p.html +++ b/htroot/IndexFederated_p.html @@ -21,17 +21,20 @@
- - + The built-in search index can either be 'classic' (as before YaCy 1.03), 'solr' (experimental since 1.03) and 'off' (useful only if a remote solr index is used) - You can just switch on or off this index. If you switch it off, you will not be able to search with YaCy any more. +
+
embedded 'classic' search index
+
embedded solr search index
+
no local index
+
+
- - +
- - + + You can set one or more Solr targets here. If you wish to set several targets, then list them in the 'Solr URL' field using a ',' (comma) as separator. @@ -58,6 +61,8 @@ #(/table)#
Solr URL(s)
+
Commit-Within (milliseconds)
+
Sharding Method
Scheme
diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index fce03a821..aca303562 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -30,17 +30,17 @@ import java.util.Iterator; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.services.federated.solr.ShardSelection; +import net.yacy.cora.services.federated.solr.ShardSolrConnector; +import net.yacy.cora.services.federated.solr.SingleSolrConnector; import net.yacy.cora.services.federated.solr.SolrConnector; -import net.yacy.cora.services.federated.solr.SolrShardingConnector; -import net.yacy.cora.services.federated.solr.SolrShardingSelection; -import net.yacy.cora.services.federated.solr.SolrSingleConnector; import net.yacy.cora.storage.ConfigurationSet; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import net.yacy.search.index.Segments; +import net.yacy.search.index.SolrField; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -import net.yacy.search.index.SolrField; public class IndexFederated_p { @@ -51,13 +51,15 @@ public class IndexFederated_p { if (post != null && post.containsKey("set")) { // yacy - env.setConfig("federated.service.yacy.indexing.enabled", post.getBoolean("yacy.indexing.enabled", false)); + String localindex = post.get("yacy.indexing", "off"); + env.setConfig("federated.service.yacy.indexing.engine", localindex); // solr final boolean solrWasOn = env.getConfigBool("federated.service.solr.indexing.enabled", true); - final boolean solrIsOnAfterwards = post.getBoolean("solr.indexing.enabled", false); + final boolean solrIsOnAfterwards = post.getBoolean("solr.indexing.solrremote", false); env.setConfig("federated.service.solr.indexing.enabled", solrIsOnAfterwards); String solrurls = post.get("solr.indexing.url", env.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr")); + int commitWithinMs = 
post.getInt("solr.indexing.commitWithinMs", env.getConfigInt("federated.service.solr.indexing.commitWithinMs", 180000)); final BufferedReader r = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(UTF8.getBytes(solrurls)))); final StringBuilder s = new StringBuilder(); String s0; @@ -75,24 +77,31 @@ public class IndexFederated_p { } solrurls = s.toString().trim(); env.setConfig("federated.service.solr.indexing.url", solrurls); + env.setConfig("federated.service.solr.indexing.commitWithinMs", commitWithinMs); env.setConfig("federated.service.solr.indexing.sharding", post.get("solr.indexing.sharding", env.getConfig("federated.service.solr.indexing.sharding", "modulo-host-md5"))); final String schemename = post.get("solr.indexing.schemefile", env.getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list")); env.setConfig("federated.service.solr.indexing.schemefile", schemename); if (solrWasOn) { // switch off - sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().close(); - sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null); + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getRemoteSolr().close(); + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectRemoteSolr(null); } if (solrIsOnAfterwards) { // switch on final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; try { - sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? 
new SolrShardingConnector(solrurls, SolrShardingSelection.Method.MODULO_HOST_MD5, 10000, true) : null); + if (usesolr) { + SolrConnector solr = new ShardSolrConnector(solrurls, ShardSelection.Method.MODULO_HOST_MD5, 10000, true); + solr.setCommitWithinMs(commitWithinMs); + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectRemoteSolr(solr); + } else { + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectRemoteSolr(null); + } } catch (final IOException e) { Log.logException(e); - sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null); + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectRemoteSolr(null); } } @@ -127,13 +136,13 @@ public class IndexFederated_p { } // show solr host table - if (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() == null) { + if (sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getRemoteSolr() == null) { prop.put("table", 0); } else { prop.put("table", 1); - final SolrConnector solr = sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr(); - final long[] size = (solr instanceof SolrShardingConnector) ? ((SolrShardingConnector) solr).getSizeList() : new long[]{((SolrSingleConnector) solr).getSize()}; - final String[] urls = (solr instanceof SolrShardingConnector) ? ((SolrShardingConnector) solr).getAdminInterfaceList() : new String[]{((SolrSingleConnector) solr).getAdminInterface()}; + final SolrConnector solr = sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getRemoteSolr(); + final long[] size = (solr instanceof ShardSolrConnector) ? ((ShardSolrConnector) solr).getSizeList() : new long[]{((SingleSolrConnector) solr).getSize()}; + final String[] urls = (solr instanceof ShardSolrConnector) ? ((ShardSolrConnector) solr).getAdminInterfaceList() : new String[]{((SingleSolrConnector) solr).getAdminInterface()}; boolean dark = false; for (int i = 0; i < size.length; i++) { prop.put("table_list_" + i + "_dark", dark ? 
1 : 0); dark = !dark; @@ -171,9 +180,14 @@ public class IndexFederated_p { prop.put("scheme", c); // fill attribute fields - prop.put("yacy.indexing.enabled.checked", env.getConfigBool("federated.service.yacy.indexing.enabled", true) ? 1 : 0); - prop.put("solr.indexing.enabled.checked", env.getConfigBool("federated.service.solr.indexing.enabled", false) ? 1 : 0); + // allowed values are: classic, solr, off + // federated.service.yacy.indexing.engine = classic + prop.put("yacy.indexing.engine.classic.checked", env.getConfig("federated.service.yacy.indexing.engine", "classic").equals("classic") ? 1 : 0); + prop.put("yacy.indexing.engine.solr.checked", env.getConfig("federated.service.yacy.indexing.engine", "classic").equals("solr") ? 1 : 0); + prop.put("yacy.indexing.engine.off.checked", env.getConfig("federated.service.yacy.indexing.engine", "classic").equals("off") ? 1 : 0); + prop.put("solr.indexing.solrremote.checked", env.getConfigBool("federated.service.solr.indexing.enabled", false) ? 
1 : 0); prop.put("solr.indexing.url", env.getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr").replace(",", "\n")); + prop.put("solr.indexing.commitWithinMs", env.getConfigInt("federated.service.solr.indexing.commitWithinMs", 180000)); prop.put("solr.indexing.sharding", env.getConfig("federated.service.solr.indexing.sharding", "modulo-host-md5")); prop.put("solr.indexing.schemefile", schemename); diff --git a/htroot/Messages_p.java b/htroot/Messages_p.java index 5af5162f6..5b182972b 100644 --- a/htroot/Messages_p.java +++ b/htroot/Messages_p.java @@ -1,4 +1,4 @@ -// Messages_p.java +// Messages_p.java // ----------------------- // part of the AnomicHTTPD caching proxy // (C) by Michael Peter Christen; mc@yacy.net @@ -35,10 +35,11 @@ import java.util.TreeMap; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.kelondro.util.FileUtils; import net.yacy.peers.Seed; import net.yacy.search.Switchboard; +import com.google.common.io.Files; + import de.anomic.data.MessageBoard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -98,7 +99,7 @@ public class Messages_p { final File notifierSource = new File(sb.getAppPath(), sb.getConfig("htRootPath", "htroot") + "/env/grafics/empty.gif"); final File notifierDest = new File(sb.getDataPath("htDocsPath", "DATA/HTDOCS"), "notifier.gif"); try { - FileUtils.copy(notifierSource, notifierDest); + Files.copy(notifierSource, notifierDest); } catch (final IOException e) { } diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index a43cd36b2..e1105dcda 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -33,6 +33,7 @@ import java.net.MalformedURLException; import java.util.Date; import net.yacy.cora.document.UTF8; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; 
import net.yacy.cora.services.federated.yacy.CacheStrategy; @@ -88,7 +89,7 @@ public class QuickCrawlLink_p { //host = hostSocket.substring(0, pos); } - prop.put("mode_host", "localhost"); + prop.put("mode_host", Domains.LOCALHOST); prop.put("mode_port", port); return prop; diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 9ff001aae..65d897eaa 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -114,7 +114,7 @@ public class SettingsAck_p { final InetSocketAddress theNewAddress = theServerCore.generateSocketAddress(port); final String hostName = Domains.getHostName(theNewAddress.getAddress()); prop.put("info_restart", "1"); - prop.put("info_restart_ip",(hostName.equals("0.0.0.0"))? "localhost" : hostName); + prop.put("info_restart_ip",(hostName.equals("0.0.0.0"))? Domains.LOCALHOST : hostName); prop.put("info_restart_port", theNewAddress.getPort()); env.setConfig("port", port); diff --git a/htroot/Steering.java b/htroot/Steering.java index 2424d2e1e..3b1fcd44b 100644 --- a/htroot/Steering.java +++ b/htroot/Steering.java @@ -29,6 +29,7 @@ import java.io.File; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.logging.Log; @@ -46,7 +47,7 @@ public class Steering { final serverObjects prop = new serverObjects(); prop.put("info", "0"); //no information submitted - final String requestIP = post.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "127.0.0.1"); + final String requestIP = post.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST); // handle access rights if (!sb.verifyAuthentication(header)) { diff --git a/htroot/Table_API_p.java b/htroot/Table_API_p.java index 9b2bb29f6..c6ae5e26a 100644 --- a/htroot/Table_API_p.java +++ b/htroot/Table_API_p.java @@ -29,6 +29,7 @@ import java.util.TreeSet; import java.util.regex.Pattern; import net.yacy.cora.document.UTF8; +import net.yacy.cora.protocol.Domains; import 
net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.index.RowSpaceExceededException; @@ -170,7 +171,7 @@ public class Table_API_p { } // now call the api URLs and store the result status - final Map l = sb.tables.execAPICalls("localhost", (int) sb.getConfigLong("port", 8090), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), pks); + final Map l = sb.tables.execAPICalls(Domains.LOCALHOST, (int) sb.getConfigLong("port", 8090), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), pks); // construct result table prop.put("showexec", l.size() > 0 ? 1 : 0); diff --git a/htroot/User.java b/htroot/User.java index 69e478a7a..0de54d8ee 100644 --- a/htroot/User.java +++ b/htroot/User.java @@ -112,7 +112,7 @@ public class User{ cookie=sb.userDB.getAdminCookie(); if(entry != null || staticAdmin){ - final ResponseHeader outgoingHeader=new ResponseHeader(); + final ResponseHeader outgoingHeader=new ResponseHeader(200); outgoingHeader.setCookie("login", cookie); prop.setOutgoingHeader(outgoingHeader); diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index eee86329e..b12a94921 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -32,6 +32,7 @@ import java.io.InputStream; import java.net.MalformedURLException; import java.util.Map; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.yacy.CacheStrategy; @@ -67,7 +68,7 @@ public class ViewImage { String urlString = post.get("url", ""); final String urlLicense = post.get("code", ""); - final boolean auth = (header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "")).equals("localhost") || sb.verifyAuthentication(header); // handle access rights + final boolean auth = Domains.isLocalhost(header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "")) || sb.verifyAuthentication(header); // handle access rights DigestURI url = null; if 
((urlString.length() > 0) && (auth)) try { diff --git a/htroot/Wiki.java b/htroot/Wiki.java index a0e949bdd..a23deca12 100644 --- a/htroot/Wiki.java +++ b/htroot/Wiki.java @@ -39,6 +39,7 @@ import java.util.Locale; import java.util.Map; import net.yacy.cora.document.UTF8; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.util.ByteBuffer; @@ -73,7 +74,7 @@ public class Wiki { String access = sb.getConfig("WikiAccess", "admin"); final String pagename = get(post, "page", "start"); - final String ip = get(post, HeaderFramework.CONNECTION_PROP_CLIENTIP, "127.0.0.1"); + final String ip = get(post, HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST); String author = get(post, "author", ANONYMOUS); if (author.equals(ANONYMOUS)) { author = WikiBoard.guessAuthor(ip); diff --git a/htroot/opensearchdescription.java b/htroot/opensearchdescription.java index 9f4276f30..82236d737 100644 --- a/htroot/opensearchdescription.java +++ b/htroot/opensearchdescription.java @@ -24,6 +24,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; @@ -39,7 +40,7 @@ public class opensearchdescription { String promoteSearchPageGreeting = env.getConfig(SwitchboardConstants.GREETING, ""); if (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) promoteSearchPageGreeting = env.getConfig("network.unit.description", ""); - String thisaddress = header.get("Host", "127.0.0.1"); + String thisaddress = header.get("Host", Domains.LOCALHOST); if (thisaddress.indexOf(':',0) == -1) thisaddress += ":" + serverCore.getPortNr(env.getConfig("port", "8090")); int compareyacy = 0; diff --git a/htroot/suggest.java b/htroot/suggest.java 
index 7ac179525..cb685796e 100644 --- a/htroot/suggest.java +++ b/htroot/suggest.java @@ -120,7 +120,7 @@ public class suggest { // Adding CORS Access header for xml output if (xml) { - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*"); prop.setOutgoingHeader(outgoingHeader); } diff --git a/htroot/yacy/message.java b/htroot/yacy/message.java index f4446f997..592951e7e 100644 --- a/htroot/yacy/message.java +++ b/htroot/yacy/message.java @@ -40,11 +40,13 @@ import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.FileUtils; -import net.yacy.peers.Seed; -import net.yacy.peers.Protocol; import net.yacy.peers.Network; +import net.yacy.peers.Protocol; +import net.yacy.peers.Seed; import net.yacy.search.Switchboard; + +import com.google.common.io.Files; + import de.anomic.data.MessageBoard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -150,7 +152,7 @@ public final class message { final File notifierSource = new File(sb.getAppPath(), sb.getConfig("htRootPath","htroot") + "/env/grafics/message.gif"); final File notifierDest = new File(sb.getDataPath("htDocsPath", "DATA/HTDOCS"), "notifier.gif"); try { - FileUtils.copy(notifierSource, notifierDest); + Files.copy(notifierSource, notifierDest); } catch (final IOException e) { Log.logSevere("MESSAGE", "NEW MESSAGE ARRIVED! 
(error: " + e.getMessage() + ")"); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 65734d6c8..43f5ddc9c 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -205,7 +205,7 @@ public class yacysearch { // Adding CORS Access header for yacysearch.rss output if ( rss ) { - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*"); prop.setOutgoingHeader(outgoingHeader); } @@ -1075,7 +1075,7 @@ public class yacysearch { prop.put("depth", "0"); // adding some additional properties needed for the rss feed - String hostName = header.get("Host", "localhost"); + String hostName = header.get("Host", Domains.LOCALHOST); if ( hostName.indexOf(':', 0) == -1 ) { hostName += ":" + serverCore.getPortNr(env.getConfig("port", "8090")); } @@ -1136,7 +1136,7 @@ public class yacysearch { // hostname and port (assume locahost if nothing helps) final InetAddress hostIP = Domains.myPublicLocalIP(); - prop.put("myhost", hostIP != null ? hostIP.getHostAddress() : "localhost"); + prop.put("myhost", hostIP != null ? 
hostIP.getHostAddress() : Domains.LOCALHOST); prop.put("myport", serverCore.getPortNr(sb.getConfig("port", "8090"))); // return rewrite properties diff --git a/htroot/yacysearch_location.java b/htroot/yacysearch_location.java index f1c78180b..b2d702bdb 100644 --- a/htroot/yacysearch_location.java +++ b/htroot/yacysearch_location.java @@ -24,6 +24,7 @@ import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.opensearch.SRURSSConnector; @@ -119,7 +120,7 @@ public class yacysearch_location { if (post == null) return prop; String promoteSearchPageGreeting = env.getConfig(SwitchboardConstants.GREETING, ""); if (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) promoteSearchPageGreeting = env.getConfig("network.unit.description", ""); - String hostName = header.get("Host", "localhost"); + String hostName = header.get("Host", Domains.LOCALHOST); if (hostName.indexOf(':',0) == -1) hostName += ":" + serverCore.getPortNr(env.getConfig("port", "8090")); final String originalquerystring = (post == null) ? 
"" : post.get("query", post.get("search", "")).trim(); // SRU compliance final boolean global = post.get("kml_resource", "local").equals("global"); diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 4bb9e50ac..d1fe7ec0f 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -31,6 +31,7 @@ import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.Classification; import net.yacy.cora.document.Classification.ContentDomain; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader.FileType; @@ -70,7 +71,7 @@ public class yacysearchitem { final String eventID = post.get("eventID", ""); final boolean authenticated = sb.adminAuthenticated(header) >= 2; final int item = post.getInt("item", -1); - final boolean auth = (header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "")).equals("localhost") || sb.verifyAuthentication(header); + final boolean auth = Domains.isLocalhost(header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "")) || sb.verifyAuthentication(header); final RequestHeader.FileType fileType = header.fileType(); // default settings for blank item diff --git a/lib/apache-solr-core-3.6.0.jar b/lib/apache-solr-core-3.6.0.jar new file mode 100644 index 000000000..e1bb4f03c Binary files /dev/null and b/lib/apache-solr-core-3.6.0.jar differ diff --git a/lib/commons-httpclient-3.1.jar b/lib/commons-httpclient-3.1.jar new file mode 100644 index 000000000..7c59774ae Binary files /dev/null and b/lib/commons-httpclient-3.1.jar differ diff --git a/lib/commons-lang-2.6.jar b/lib/commons-lang-2.6.jar new file mode 100644 index 000000000..98467d3a6 Binary files /dev/null and b/lib/commons-lang-2.6.jar differ diff --git a/lib/dependencies.txt b/lib/dependencies.txt index ecf091210..b74ad1fc8 100644 --- a/lib/dependencies.txt +++ b/lib/dependencies.txt @@ -1,6 +1,11 
@@ list of library-dependencies: -* apache-solr-solrj-3.4.0.jar depens on: +* jetty (these files were taken from the solr 3.6.0 example) +jetty-6.1.26-patched-JETTY-1340.jar +jetty-util-6.1.26-patched-JETTY-1340.jar +servlet-api-2.5-20081211.jar + +* apache-solr-solrj-3.4.0.jar depends on: commons-codec-1.4.jar commons-httpclient-3.1.jar commons-io-1.4.jar @@ -9,6 +14,29 @@ jcl-over-slf4j-1.6.1.jar slf4j-api-1.6.1.jar wstx-asl-3.2.7.jar +* apache-solr-core-3.6.0.jar depends on +commons-codec-1.6.jar +commons-fileupload-1.2.1.jar +commons-httpclient-3.1.jar +commons-httpclient-3.1.jar +commons-io-2.1.jar +commons-lang-2.6.jar +geronimo-stax-api_1.0_spec-1.0.1.jar +guava-r05.jar +httpclient-4.2.jar +httpcore-4.2.jar +jcl-over-slf4j-1.6.1.jar +log4j-over-slf4j-1.6.1.jar +lucene-analyzers-3.6.0.jar +lucene-core-3.6.0.jar +lucene-highlighter-3.6.0.jar +lucene-phonetic-3.6.0.jar +lucene-spatial-3.6.0.jar +lucene-spellchecker-3.6.0.jar +slf4j-api-1.6.1.jar +slf4j-jdk14-1.6.1.jar +wstx-asl-3.2.7.jar + * pdfbox-1.6.0.jar depends on: fontbox-1.6.0.jar jempbox-1.6.0.jar diff --git a/lib/fontbox-1.6.License b/lib/fontbox-1.7.0.License similarity index 100% rename from lib/fontbox-1.6.License rename to lib/fontbox-1.7.0.License diff --git a/lib/fontbox-1.6.0.jar b/lib/fontbox-1.7.0.jar similarity index 71% rename from lib/fontbox-1.6.0.jar rename to lib/fontbox-1.7.0.jar index c3492fc29..2f3fbe2c3 100644 Binary files a/lib/fontbox-1.6.0.jar and b/lib/fontbox-1.7.0.jar differ diff --git a/lib/guava-r05.jar b/lib/guava-r05.jar new file mode 100644 index 000000000..0407b9c01 Binary files /dev/null and b/lib/guava-r05.jar differ diff --git a/lib/jempbox-1.6.0.License b/lib/jempbox-1.7.0.License similarity index 100% rename from lib/jempbox-1.6.0.License rename to lib/jempbox-1.7.0.License diff --git a/lib/jempbox-1.6.0.jar b/lib/jempbox-1.7.0.jar similarity index 78% rename from lib/jempbox-1.6.0.jar rename to lib/jempbox-1.7.0.jar index 5ca6d7a58..770b39bd6 100644 Binary files 
a/lib/jempbox-1.6.0.jar and b/lib/jempbox-1.7.0.jar differ diff --git a/lib/jetty-6.1.26-patched-JETTY-1340.jar b/lib/jetty-6.1.26-patched-JETTY-1340.jar new file mode 100644 index 000000000..6be492c92 Binary files /dev/null and b/lib/jetty-6.1.26-patched-JETTY-1340.jar differ diff --git a/lib/servlet-api.License b/lib/jetty-LICENSE-ASL.txt similarity index 99% rename from lib/servlet-api.License rename to lib/jetty-LICENSE-ASL.txt index 261eeb9e9..d64569567 100644 --- a/lib/servlet-api.License +++ b/lib/jetty-LICENSE-ASL.txt @@ -1,3 +1,4 @@ + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ diff --git a/lib/jetty-util-6.1.26-patched-JETTY-1340.jar b/lib/jetty-util-6.1.26-patched-JETTY-1340.jar new file mode 100644 index 000000000..1a9ace88d Binary files /dev/null and b/lib/jetty-util-6.1.26-patched-JETTY-1340.jar differ diff --git a/lib/jetty-util-LICENSE-ASL.txt b/lib/jetty-util-LICENSE-ASL.txt new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/lib/jetty-util-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lib/log4j-over-slf4j-1.6.1.jar b/lib/log4j-over-slf4j-1.6.1.jar new file mode 100644 index 000000000..c4025f42e Binary files /dev/null and b/lib/log4j-over-slf4j-1.6.1.jar differ diff --git a/lib/lucene-analyzers-3.6.0.jar b/lib/lucene-analyzers-3.6.0.jar new file mode 100644 index 000000000..d7e5ccf77 Binary files /dev/null and b/lib/lucene-analyzers-3.6.0.jar differ diff --git a/lib/lucene-core-3.6.0.jar b/lib/lucene-core-3.6.0.jar new file mode 100644 index 000000000..5cb8dae5b Binary files /dev/null and b/lib/lucene-core-3.6.0.jar differ diff --git a/lib/lucene-highlighter-3.6.0.jar b/lib/lucene-highlighter-3.6.0.jar new file mode 100644 index 000000000..f07b95df4 Binary files /dev/null and b/lib/lucene-highlighter-3.6.0.jar differ diff --git a/lib/lucene-phonetic-3.6.0.jar b/lib/lucene-phonetic-3.6.0.jar new file mode 100644 index 000000000..c10979b07 Binary files /dev/null and b/lib/lucene-phonetic-3.6.0.jar differ diff --git a/lib/lucene-spatial-3.6.0.jar b/lib/lucene-spatial-3.6.0.jar new file mode 100644 index 000000000..c94aaad8c Binary files /dev/null and b/lib/lucene-spatial-3.6.0.jar differ diff --git a/lib/lucene-spellchecker-3.6.0.jar b/lib/lucene-spellchecker-3.6.0.jar new file mode 100644 index 000000000..3cbc48909 Binary files /dev/null and b/lib/lucene-spellchecker-3.6.0.jar differ diff --git a/lib/pdfbox-1.6.License b/lib/pdfbox-1.7.0.License similarity 
index 100% rename from lib/pdfbox-1.6.License rename to lib/pdfbox-1.7.0.License diff --git a/lib/pdfbox-1.6.0.jar b/lib/pdfbox-1.7.0.jar similarity index 85% rename from lib/pdfbox-1.6.0.jar rename to lib/pdfbox-1.7.0.jar index 75efe034f..9198f1047 100644 Binary files a/lib/pdfbox-1.6.0.jar and b/lib/pdfbox-1.7.0.jar differ diff --git a/lib/servlet-api-2.5-20081211.jar b/lib/servlet-api-2.5-20081211.jar new file mode 100644 index 000000000..b0537c4db Binary files /dev/null and b/lib/servlet-api-2.5-20081211.jar differ diff --git a/lib/servlet-api-LICENSE-ASL.txt b/lib/servlet-api-LICENSE-ASL.txt new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/lib/servlet-api-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lib/servlet-api.jar b/lib/servlet-api.jar deleted file mode 100644 index 308fd7a38..000000000 Binary files a/lib/servlet-api.jar and /dev/null differ diff --git a/nbproject/project.xml b/nbproject/project.xml index a2a0d4939..9ea9f3bb2 100644 --- a/nbproject/project.xml +++ b/nbproject/project.xml @@ -77,7 +77,7 @@ source htroot - lib/activation.jar:lib/apache-mime4j-0.6.jar:lib/apache-solr-solrj-3.6.0.jar:lib/arq-2.8.7.jar:lib/bcmail-jdk15-145.jar:lib/bcprov-jdk15-145.jar:lib/commons-compress-1.4.1.jar:lib/commons-codec-1.6.jar:lib/commons-fileupload-1.2.2.jar:lib/commons-httpclient-3.1.jar:lib/commons-io-2.1.jar:lib/commons-jxpath-1.3.jar:lib/commons-logging-1.1.1.jar:lib/fontbox-1.6.0.jar:lib/geronimo-stax-api_1.0_spec-1.0.1.jar:lib/htmllexer.jar:lib/htmlparser.jar:lib/httpclient-4.2.jar:lib/httpcore-4.2.jar:lib/httpmime-4.2.jar:lib/icu4j-core.jar:lib/iri-0.8.jar:lib/J7Zip-modified.jar:lib/jakarta-oro-2.0.8.jar:lib/jcifs-1.3.15.jar:lib/jcl-over-slf4j-1.6.1.jar:lib/jempbox-1.6.0.jar:lib/jena-2.6.4.jar:lib/jsch-0.1.42.jar:lib/json-simple-1.1.jar:lib/log4j-1.2.16.jar:lib/metadata-extractor-2.4.0-beta-1.jar:lib/mysql-connector-java-5.1.12-bin.jar:lib/pdfbox-1.6.0.jar:lib/poi-3.6-20091214.jar:lib/poi-scratchpad-3.6-20091214.jar:lib/sax-2.0.1.jar:lib/servlet-api.jar:lib/slf4j-api-1.6.1.jar:lib/slf4j-jdk14-1.6.1.jar:lib/webcat-0.1-swf.jar:lib/wstx-asl-3.2.7.jar:lib/xercesImpl.jar
:lib/xml-apis.jar:htroot/processing/domaingraph/applet/domaingraph.jar + lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/apache-solr-core-3.6.0.jar;lib/apache-solr-solrj-3.6.0.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.6.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.1.jar;lib/fontbox-1.7.0.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-r05.jar;lib/htmllexer.jar;lib/htmlparser.jar;lib/httpclient-4.2.jar;lib/httpcore-4.2.jar;lib/httpmime-4.2.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.6.1.jar;lib/jempbox-1.7.0.jar;lib/jena-2.6.4.jar;lib/jetty-6.1.26-patched-JETTY-1340.jar;lib/jetty-util-6.1.26-patched-JETTY-1340.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/log4j-1.2.16.jar;lib/log4j-over-slf4j-1.6.1.jar;lib/lucene-analyzers-3.6.0.jar;lib/lucene-core-3.6.0.jar;lib/lucene-highlighter-3.6.0.jar;lib/lucene-phonetic-3.6.0.jar;lib/lucene-spatial-3.6.0.jar;lib/lucene-spellchecker-3.6.0.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.0.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.6.1.jar;lib/slf4j-jdk14-1.6.1.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar 1.6 diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index b591ce228..f0a5f21dd 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -41,6 +41,7 @@ import java.util.concurrent.ConcurrentMap; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.CloneableIterator; +import net.yacy.cora.protocol.Domains; import 
net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; @@ -59,7 +60,6 @@ public class Balancer { private static final String indexSuffix = "A.db"; private static final int EcoFSBufferSize = 1000; private static final int objectIndexBufferSize = 1000; - private static final String localhost = "localhost"; // class variables filled with external values private final File cacheStacksPath; @@ -330,7 +330,7 @@ public class Balancer { private void pushHashToDomainStacks(String host, final byte[] urlhash) throws RowSpaceExceededException { // extend domain stack - if (host == null) host = localhost; + if (host == null) host = Domains.LOCALHOST; HandleSet domainList = this.domainStacks.get(host); if (domainList == null) { // create new list @@ -345,7 +345,7 @@ public class Balancer { private void removeHashFromDomainStacks(String host, final byte[] urlhash) { // reduce domain stack - if (host == null) host = localhost; + if (host == null) host = Domains.LOCALHOST; final HandleSet domainList = this.domainStacks.get(host); if (domainList == null) { this.domainStacks.remove(host); diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index b11859dbc..319854f13 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -82,8 +82,8 @@ public class CrawlQueues { this.log.logConfig("Starting Crawling Management"); this.noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727); FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME)); - this.errorURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), sb.solrScheme, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727); - this.delegatedURL = new ZURL(sb.indexSegments.segment(PROCESS).getSolr(), sb.solrScheme, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, 
sb.exceed134217727); + this.errorURL = new ZURL(sb.indexSegments.segment(PROCESS).getRemoteSolr(), sb.solrScheme, queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727); + this.delegatedURL = new ZURL(sb.indexSegments.segment(PROCESS).getRemoteSolr(), sb.solrScheme, queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727); } public void relocate(final File newQueuePath) { @@ -94,8 +94,8 @@ public class CrawlQueues { this.noticeURL = new NoticedURL(newQueuePath, this.sb.peers.myBotIDs(), this.sb.useTailCache, this.sb.exceed134217727); FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME)); - this.errorURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), this.sb.solrScheme, newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727); - this.delegatedURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getSolr(), this.sb.solrScheme, newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727); + this.errorURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getRemoteSolr(), this.sb.solrScheme, newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727); + this.delegatedURL = new ZURL(this.sb.indexSegments.segment(PROCESS).getRemoteSolr(), this.sb.solrScheme, newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727); } public synchronized void close() { diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 64c0f4535..3d4c55beb 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -333,7 +333,7 @@ public class RobotsTxt { ByteCount.addAccountCount(ByteCount.CRAWLER, robotsTxt.length); } final int code = client.getHttpResponse().getStatusLine().getStatusCode(); - final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + final ResponseHeader header = new 
ResponseHeader(code, client.getHttpResponse().getAllHeaders()); // check the response status if (code > 199 && code < 300) { diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java index eb7a2631c..603e517d5 100644 --- a/source/de/anomic/crawler/ZURL.java +++ b/source/de/anomic/crawler/ZURL.java @@ -38,7 +38,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; import net.yacy.cora.services.federated.solr.SolrConnector; import net.yacy.cora.services.federated.solr.SolrDoc; -import net.yacy.cora.services.federated.solr.SolrShardingConnector; +import net.yacy.cora.services.federated.solr.ShardSolrConnector; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.Index; @@ -114,7 +114,7 @@ public class ZURL implements Iterable { this.stack = new LinkedBlockingQueue(); } - public ZURL(final SolrShardingConnector solrConnector, + public ZURL(final ShardSolrConnector solrConnector, final SolrConfiguration solrConfiguration) { this.solrConnector = solrConnector; this.solrConfiguration = solrConfiguration; diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index 202c87034..aca20dfad 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -124,7 +124,7 @@ public class FTPLoader { if (dirList == null) { response = null; } else { - final ResponseHeader responseHeader = new ResponseHeader(); + final ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); @@ -132,7 +132,6 @@ public class FTPLoader { request, requestHeader, responseHeader, - "200", profile, false, 
dirList.toString().getBytes()); @@ -226,7 +225,7 @@ public class FTPLoader { final DigestURI refurl = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()); if (refurl != null) requestHeader.put(RequestHeader.REFERER, refurl.toNormalform(true, false)); } - final ResponseHeader responseHeader = new ResponseHeader(); + final ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(fileDate)); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); @@ -252,7 +251,6 @@ public class FTPLoader { request, requestHeader, responseHeader, - "200", profile, false, null); @@ -268,7 +266,6 @@ public class FTPLoader { request, requestHeader, responseHeader, - "200", profile, false, b); diff --git a/source/de/anomic/crawler/retrieval/FileLoader.java b/source/de/anomic/crawler/retrieval/FileLoader.java index 87451c169..a989d11a0 100644 --- a/source/de/anomic/crawler/retrieval/FileLoader.java +++ b/source/de/anomic/crawler/retrieval/FileLoader.java @@ -85,7 +85,7 @@ public class FileLoader { StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true); - ResponseHeader responseHeader = new ResponseHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); @@ -93,7 +93,6 @@ public class FileLoader { request, requestHeader, responseHeader, - "200", profile, false, content.toString().getBytes()); @@ -103,7 +102,7 @@ public class FileLoader { // create response header String mime = Classification.ext2mime(url.getFileExtension()); - ResponseHeader responseHeader = new ResponseHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, 
HeaderFramework.formatRFC1123(new Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); @@ -134,7 +133,6 @@ public class FileLoader { request, requestHeader, responseHeader, - "200", profile, false, url.toTokens().getBytes()); @@ -152,7 +150,6 @@ public class FileLoader { request, requestHeader, responseHeader, - "200", profile, false, b); diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 6d8726fbe..57b71434a 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -80,8 +80,8 @@ public final class HTTPLoader { private Response load(final Request request, final int retryCount, final int maxFileSize, final boolean checkBlacklist) throws IOException { if (retryCount < 0) { - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection counter exceeded", -1); - throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted."); + this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); + throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". 
Processing aborted."); } DigestURI url = request.url(); @@ -131,15 +131,15 @@ public final class HTTPLoader { // send request final byte[] responseBody = client.GETbytes(url, maxFileSize); - final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); - final int code = client.getHttpResponse().getStatusLine().getStatusCode(); + final int code = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); if (code > 299 && code < 310) { // redirection (content may be empty) if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { - if (header.containsKey(HeaderFramework.LOCATION)) { + if (responseHeader.containsKey(HeaderFramework.LOCATION)) { // getting redirection URL - String redirectionUrlString = header.get(HeaderFramework.LOCATION); + String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); redirectionUrlString = redirectionUrlString.trim(); if (redirectionUrlString.length() == 0) { @@ -202,8 +202,7 @@ public final class HTTPLoader { response = new Response( request, requestHeader, - header, - Integer.toString(code), + responseHeader, profile, false, responseBody @@ -254,8 +253,8 @@ public final class HTTPLoader { client.setTimout(20000); client.setHeader(requestHeader.entrySet()); final byte[] responseBody = client.GETbytes(request.url()); - final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); - final int code = client.getHttpResponse().getStatusLine().getStatusCode(); + final int code = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); // FIXME: 30*-handling (bottom) is never reached // we always get the final content because httpClient.followRedirects = true @@ -272,7 +271,6 @@ public final class HTTPLoader { request, requestHeader, 
header, - Integer.toString(code), null, false, responseBody diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index 34715a520..dec791021 100644 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -63,7 +63,6 @@ public class Response { private final Request request; private final RequestHeader requestHeader; private final ResponseHeader responseHeader; - private final String responseStatus; private final CrawlProfile profile; private byte[] content; private int status; // tracker indexing status, see status defs below @@ -151,7 +150,6 @@ public class Response { final Request request, final RequestHeader requestHeader, final ResponseHeader responseHeader, - final String responseStatus, final CrawlProfile profile, final boolean fromCache, final byte[] content) { @@ -159,7 +157,6 @@ public class Response { // request and response headers may be zero in case that we process surrogates this.requestHeader = requestHeader; this.responseHeader = responseHeader; - this.responseStatus = responseStatus; this.profile = profile; this.status = QUEUE_STATE_FRESH; this.content = content; @@ -176,10 +173,9 @@ public class Response { this.request = request; // request and response headers may be zero in case that we process surrogates this.requestHeader = new RequestHeader(); - this.responseHeader = new ResponseHeader(); + this.responseHeader = new ResponseHeader(200); this.responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); // tell parser how to handle the content if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size())); - this.responseStatus = "200"; this.profile = profile; this.status = QUEUE_STATE_FRESH; this.content = request.name().length() > 0 ? 
request.name().getBytes() : request.url().toTokens().getBytes(); @@ -190,10 +186,9 @@ public class Response { final Request request, final RequestHeader requestHeader, final ResponseHeader responseHeader, - final String responseStatus, final CrawlProfile profile, final boolean fromCache) { - this(request, requestHeader, responseHeader, responseStatus, profile, fromCache, null); + this(request, requestHeader, responseHeader, profile, fromCache, null); } public void updateStatus(final int newStatus) { @@ -371,7 +366,7 @@ public class Response { // check status code if (!validResponseStatus()) { - return "bad_status_" + this.responseStatus; + return "bad_status_" + this.responseHeader.getStatusCode(); } if (this.requestHeader != null) { @@ -796,7 +791,8 @@ public class Response { } public boolean validResponseStatus() { - return (this.responseStatus == null) ? false : this.responseStatus.startsWith("200") || this.responseStatus.startsWith("203"); + int status = this.responseHeader.getStatusCode(); + return status == 200 || status == 203; } public Date ifModifiedSince() { diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java index e968263be..0726aabfe 100644 --- a/source/de/anomic/crawler/retrieval/SMBLoader.java +++ b/source/de/anomic/crawler/retrieval/SMBLoader.java @@ -104,7 +104,7 @@ public class SMBLoader { StringBuilder content = FTPClient.dirhtml(u, null, null, null, list, true); - ResponseHeader responseHeader = new ResponseHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date())); responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes()); @@ -112,7 +112,6 @@ public class SMBLoader { request, requestHeader, responseHeader, - "200", profile, false, content.toString().getBytes()); @@ -122,7 +121,7 @@ public 
class SMBLoader { // create response header String mime = Classification.ext2mime(url.getFileExtension()); - ResponseHeader responseHeader = new ResponseHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); @@ -153,7 +152,6 @@ public class SMBLoader { request, requestHeader, responseHeader, - "200", profile, false, url.toTokens().getBytes()); @@ -171,7 +169,6 @@ public class SMBLoader { request, requestHeader, responseHeader, - "200", profile, false, b); diff --git a/source/de/anomic/data/BlogBoard.java b/source/de/anomic/data/BlogBoard.java index 13dd200e0..f75d03465 100644 --- a/source/de/anomic/data/BlogBoard.java +++ b/source/de/anomic/data/BlogBoard.java @@ -1,4 +1,4 @@ -// BlogBoard.java +// BlogBoard.java // ------------------------------------- // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de @@ -38,7 +38,9 @@ import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.List; import java.util.Map; +import java.util.Set; import java.util.TreeSet; import javax.xml.parsers.DocumentBuilder; @@ -47,6 +49,7 @@ import javax.xml.parsers.ParserConfigurationException; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.UTF8; +import net.yacy.cora.protocol.Domains; import net.yacy.kelondro.blob.MapHeap; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; @@ -60,50 +63,48 @@ import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import de.anomic.data.wiki.WikiBoard; -import java.util.List; -import java.util.Set; public class BlogBoard { - + private static final int KEY_LENGTH = 64; - + private MapHeap database = null; - + public BlogBoard(final File actpath) throws IOException { new File(actpath.getParent()).mkdir(); 
//database = new MapView(BLOBTree.toHeap(actpath, true, true, keyLength, recordSize, '_', NaturalOrder.naturalOrder, newFile), 500, '_'); - database = new MapHeap(actpath, KEY_LENGTH, NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); + this.database = new MapHeap(actpath, KEY_LENGTH, NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); } - + public int size() { - return database.size(); + return this.database.size(); } - + /** * Tells if the database contains an element. * @param key the ID of the element * @return true if the database contains the element, else false */ public boolean contains(final String key) { - return database.containsKey(UTF8.getBytes(key)); + return this.database.containsKey(UTF8.getBytes(key)); } - + public synchronized void close() { - database.close(); + this.database.close(); } - + private static String normalize(final String key) { return (key == null) ? "null" : key.trim().toLowerCase(); } - + public static String webalize(final String key) { return (key == null) ? "null": key.trim().toLowerCase().replaceAll(" ", "%20"); } - + public String guessAuthor(final String ip) { return WikiBoard.guessAuthor(ip); } - + /** * Create a new BlogEntry and return it * @param key @@ -126,7 +127,7 @@ public class BlogBoard { public String writeBlogEntry(final BlogEntry page) { String ret = null; try { - database.insert(UTF8.getBytes(page.key), page.record); + this.database.insert(UTF8.getBytes(page.key), page.record); ret = page.key; } catch (IOException ex) { Log.logException(ex); @@ -135,11 +136,11 @@ public class BlogBoard { } return ret; } - + public BlogEntry readBlogEntry(final String key) { - return readBlogEntry(key, database); + return readBlogEntry(key, this.database); } - + private BlogEntry readBlogEntry(final String key, final MapHeap base) { final String normalized = normalize(key); Map record; @@ -153,10 +154,10 @@ public class BlogBoard { record = null; } return (record == null) ? 
- newEntry(key, new byte[0], UTF8.getBytes("anonymous"), "127.0.0.1", new Date(), new byte[0], null, null) : + newEntry(key, new byte[0], UTF8.getBytes("anonymous"), Domains.LOCALHOST, new Date(), new byte[0], null, null) : new BlogEntry(key, record); } - + public boolean importXML(final String input) { final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); try { @@ -169,20 +170,20 @@ public class BlogBoard { } catch (final IOException ex) { Log.logException(ex); } - + return false; } - + private boolean parseXMLimport(final Document doc) { if(!"blog".equals(doc.getDocumentElement().getTagName())) { return false; } - + final NodeList items = doc.getDocumentElement().getElementsByTagName("item"); if(items.getLength() == 0) { return false; } - + for (int i = 0, n = items.getLength(); i < n; ++i) { String key = null, ip = null, StrSubject = null, StrAuthor = null, StrPage = null, StrDate = null; Date date = null; @@ -207,7 +208,7 @@ public class BlogBoard { StrPage = currentNode.getFirstChild().getNodeValue(); } } - + try { date = GenericFormatter.SHORT_SECOND_FORMATTER.parse(StrDate); } catch (final ParseException e1) { @@ -226,31 +227,32 @@ public class BlogBoard { } return true; } - + public void deleteBlogEntry(final String key) { try { - database.delete(UTF8.getBytes(normalize(key))); + this.database.delete(UTF8.getBytes(normalize(key))); } catch (final IOException e) { } } - + public Iterator keys(final boolean up) throws IOException { - return database.keys(up, false); + return this.database.keys(up, false); } - + /** * Comparator to sort objects of type Blog according to their timestamps */ public class BlogComparator implements Comparator { - + private final boolean newestFirst; - + /** * @param newestFirst newest first, or oldest first? 
*/ public BlogComparator(final boolean newestFirst){ this.newestFirst = newestFirst; } - + + @Override public int compare(final String obj1, final String obj2) { final BlogEntry blogEntry1 = readBlogEntry(obj1); final BlogEntry blogEntry2 = readBlogEntry(obj2); @@ -268,7 +270,7 @@ public class BlogBoard { return -1; } } - + public Iterator getBlogIterator(final boolean priv){ final Set set = new TreeSet(new BlogComparator(true)); final Iterator iterator = blogIterator(true); @@ -281,7 +283,7 @@ public class BlogBoard { } return set.iterator(); } - + public Iterator blogIterator(final boolean up){ try { return new BlogIterator(up); @@ -289,7 +291,7 @@ public class BlogBoard { return new HashSet().iterator(); } } - + /** * Subclass of blogBoard, which provides the blogIterator object-type */ @@ -300,7 +302,8 @@ public class BlogBoard { this.blogIter = BlogBoard.this.database.keys(up, false); //this.nextEntry = null; } - + + @Override public boolean hasNext() { try { return this.blogIter.hasNext(); @@ -309,7 +312,8 @@ public class BlogBoard { return false; } } - + + @Override public BlogEntry next() { try { return readBlogEntry(UTF8.String(this.blogIter.next())); @@ -318,7 +322,8 @@ public class BlogBoard { return null; } } - + + @Override public void remove() { // if (this.nextEntry != null) { // try { @@ -331,14 +336,14 @@ public class BlogBoard { throw new UnsupportedOperationException("Method not implemented yet."); } } - + public class BlogEntry { - + String key; Map record; - + public BlogEntry(final String nkey, final byte[] subject, final byte[] author, final String ip, final Date date, final byte[] page, final List comments, final String commentMode) { - record = new HashMap(); + this.record = new HashMap(); setKey(nkey); setDate(date); setSubject(subject); @@ -347,13 +352,13 @@ public class BlogBoard { setPage(page); setComments(comments); setCommentMode(commentMode); - + // TODO: implement this function - record.put("privacy", "public"); - + 
this.record.put("privacy", "public"); + WikiBoard.setAuthor(ip, UTF8.String(author)); } - + BlogEntry(final String key, final Map record) { this.key = key; this.record = record; @@ -364,35 +369,35 @@ public class BlogBoard { this.record.put("commentMode", "2"); } } - + private void setKey(final String key) { this.key = key.substring(0, Math.min(key.length(), KEY_LENGTH)); } - + public String getKey() { - return key; + return this.key; } - + public byte[] getSubject() { - final String m = record.get("subject"); + final String m = this.record.get("subject"); if (m == null) { return new byte[0]; } final byte[] b = Base64Order.enhancedCoder.decode(m); return (b == null) ? new byte[0] : b; } - + private void setSubject(final byte[] subject) { if (subject == null) { - record.put("subject",""); + this.record.put("subject",""); } else { - record.put("subject", Base64Order.enhancedCoder.encode(subject)); + this.record.put("subject", Base64Order.enhancedCoder.encode(subject)); } } - + public Date getDate() { try { - final String date = record.get("date"); + final String date = this.record.get("date"); if (date == null) { if (Log.isFinest("Blog")) { Log.logFinest("Blog", "ERROR: date field missing in blogBoard"); @@ -404,17 +409,17 @@ public class BlogBoard { return new Date(); } } - + private void setDate(final Date date) { Date ret = date; if (ret == null) { ret = new Date(); } - record.put("date", GenericFormatter.SHORT_SECOND_FORMATTER.format(ret)); + this.record.put("date", GenericFormatter.SHORT_SECOND_FORMATTER.format(ret)); } - + public String getTimestamp() { - final String timestamp = record.get("date"); + final String timestamp = this.record.get("date"); if (timestamp == null) { if (Log.isFinest("Blog")) { Log.logFinest("Blog", "ERROR: date field missing in blogBoard"); @@ -423,87 +428,87 @@ public class BlogBoard { } return timestamp; } - + public byte[] getAuthor() { - final String author = record.get("author"); + final String author = this.record.get("author"); 
if (author == null) { return new byte[0]; } final byte[] b = Base64Order.enhancedCoder.decode(author); return (b == null) ? new byte[0] : b; } - + private void setAuthor(final byte[] author) { if (author == null) - record.put("author",""); + this.record.put("author",""); else - record.put("author", Base64Order.enhancedCoder.encode(author)); + this.record.put("author", Base64Order.enhancedCoder.encode(author)); } - + public int getCommentsSize() { // This ist a Bugfix for Version older than 4443. - if (record.get("comments").startsWith(",")) { - record.put("comments", record.get("comments").substring(1)); + if (this.record.get("comments").startsWith(",")) { + this.record.put("comments", this.record.get("comments").substring(1)); writeBlogEntry(this); } - final List commentsize = ListManager.string2arraylist(record.get("comments")); + final List commentsize = ListManager.string2arraylist(this.record.get("comments")); return commentsize.size(); } - + public List getComments() { - return ListManager.string2arraylist(record.get("comments")); + return ListManager.string2arraylist(this.record.get("comments")); } - + private void setComments(final List comments) { if (comments == null) { - record.put("comments", ListManager.collection2string(new ArrayList())); + this.record.put("comments", ListManager.collection2string(new ArrayList())); } else { - record.put("comments", ListManager.collection2string(comments)); + this.record.put("comments", ListManager.collection2string(comments)); } } - + public String getIp() { - final String ip = record.get("ip"); - return (ip == null) ? "127.0.0.1" : ip; + final String ip = this.record.get("ip"); + return (ip == null) ? 
Domains.LOCALHOST : ip; } - + private void setIp(final String ip) { String ret = ip; if ((ret == null) || (ret.length() == 0)) ret = ""; - record.put("ip", ret); + this.record.put("ip", ret); } - + public byte[] getPage() { - final String page = record.get("page"); + final String page = this.record.get("page"); if (page == null) { return new byte[0]; } final byte[] page_as_byte = Base64Order.enhancedCoder.decode(page); return (page_as_byte == null) ? new byte[0] : page_as_byte; - } - + } + private void setPage(final byte[] page) { if (page == null) { - record.put("page", ""); + this.record.put("page", ""); } else { - record.put("page", Base64Order.enhancedCoder.encode(page)); + this.record.put("page", Base64Order.enhancedCoder.encode(page)); } } - + public void addComment(final String commentID) { - final List comments = ListManager.string2arraylist(record.get("comments")); + final List comments = ListManager.string2arraylist(this.record.get("comments")); comments.add(commentID); - record.put("comments", ListManager.collection2string(comments)); + this.record.put("comments", ListManager.collection2string(comments)); } - + public boolean removeComment(final String commentID) { - final List comments = ListManager.string2arraylist(record.get("comments")); + final List comments = ListManager.string2arraylist(this.record.get("comments")); final boolean success = comments.remove(commentID); - record.put("comments", ListManager.collection2string(comments)); + this.record.put("comments", ListManager.collection2string(comments)); return success; } - + /** * returns the comment mode * 0 - no comments allowed @@ -512,21 +517,21 @@ public class BlogBoard { * @return comment mode */ public int getCommentMode(){ - return Integer.parseInt(record.get("commentMode")); + return Integer.parseInt(this.record.get("commentMode")); } - + private void setCommentMode(final String mode) { if (mode == null) { - record.put("commentMode", "2"); + this.record.put("commentMode", "2"); } else { - 
record.put("commentMode", mode); + this.record.put("commentMode", mode); } } - + public boolean isPublic() { - final String privacy = record.get("privacy"); + final String privacy = this.record.get("privacy"); return (privacy == null || privacy.equalsIgnoreCase("public")) ? true : false; } - + } } diff --git a/source/de/anomic/data/BlogBoardComments.java b/source/de/anomic/data/BlogBoardComments.java index 690c053bd..02347e5cb 100644 --- a/source/de/anomic/data/BlogBoardComments.java +++ b/source/de/anomic/data/BlogBoardComments.java @@ -46,6 +46,7 @@ import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import net.yacy.cora.document.UTF8; +import net.yacy.cora.protocol.Domains; import net.yacy.kelondro.blob.MapHeap; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; @@ -142,7 +143,7 @@ public class BlogBoardComments { record = null; } return (record == null) ? - newEntry(copyOfKey, new byte[0], UTF8.getBytes("anonymous"), "127.0.0.1", new Date(), new byte[0]) : + newEntry(copyOfKey, new byte[0], UTF8.getBytes("anonymous"), Domains.LOCALHOST, new Date(), new byte[0]) : new CommentEntry(copyOfKey, record); } @@ -326,7 +327,7 @@ public class BlogBoardComments { public String getIp() { final String ip = this.record.get("ip"); if (ip == null) - return "127.0.0.1"; + return Domains.LOCALHOST; return ip; } private void setPage(final byte[] page) { diff --git a/source/de/anomic/data/wiki/WikiBoard.java b/source/de/anomic/data/wiki/WikiBoard.java index ffffc79ec..9074a2b00 100644 --- a/source/de/anomic/data/wiki/WikiBoard.java +++ b/source/de/anomic/data/wiki/WikiBoard.java @@ -1,4 +1,4 @@ -//wikiBoard.java +//wikiBoard.java //------------------------------------- //(C) by Michael Peter Christen; mc@yacy.net //first published on http://www.anomic.de @@ -37,6 +37,7 @@ import java.util.TimeZone; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; +import 
net.yacy.cora.protocol.Domains; import net.yacy.kelondro.blob.MapHeap; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; @@ -44,7 +45,7 @@ import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.NaturalOrder; /** - * + * */ public class WikiBoard { @@ -70,14 +71,14 @@ public class WikiBoard { */ public WikiBoard(final File actpath, final File bkppath) throws IOException { new File(actpath.getParent()).mkdirs(); - if (datbase == null) { + if (this.datbase == null) { //datbase = new MapView(BLOBTree.toHeap(actpath, true, true, keyLength, recordSize, '_', NaturalOrder.naturalOrder, actpathNew), 500, '_'); - datbase = new MapHeap(actpath, keyLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); + this.datbase = new MapHeap(actpath, keyLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); } new File(bkppath.getParent()).mkdirs(); - if (bkpbase == null) { + if (this.bkpbase == null) { //bkpbase = new MapView(BLOBTree.toHeap(bkppath, true, true, keyLength + dateFormat.length(), recordSize, '_', NaturalOrder.naturalOrder, bkppathNew), 500, '_'); - bkpbase = new MapHeap(bkppath, keyLength + DATE_FORMAT.length(), NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); + this.bkpbase = new MapHeap(bkppath, keyLength + DATE_FORMAT.length(), NaturalOrder.naturalOrder, 1024 * 64, 500, '_'); } } @@ -86,7 +87,7 @@ public class WikiBoard { * @return number of entries in wiki plus number of old entries. */ public int sizeOfTwo() { - return datbase.size() + bkpbase.size(); + return this.datbase.size() + this.bkpbase.size(); } /** @@ -94,15 +95,15 @@ public class WikiBoard { * @return number of entries in wiki. */ public int size() { - return datbase.size(); + return this.datbase.size(); } /** * Closes database files. 
*/ public synchronized void close() { - datbase.close(); - bkpbase.close(); + this.datbase.close(); + this.bkpbase.close(); } /** @@ -182,7 +183,7 @@ public class WikiBoard { public class Entry { private static final String ANONYMOUS = "anonymous"; - private String key; + private final String key; private final Map record; /** @@ -220,7 +221,7 @@ public class WikiBoard { * @return subject of entry. */ public String subject() { - return key; + return this.key; } /** @@ -230,7 +231,7 @@ public class WikiBoard { public Date date() { Date ret; try { - final String c = record.get("date"); + final String c = this.record.get("date"); if (c == null) { System.out.println("DEBUG - ERROR: date field missing in wikiBoard"); ret = new Date(); @@ -250,7 +251,7 @@ public class WikiBoard { * @return author of Entry. */ public String author() { - final String a = record.get("author"); + final String a = this.record.get("author"); final byte[] b; return (a != null && (b = Base64Order.enhancedCoder.decode(a)) != null) ? UTF8.String(b) : ANONYMOUS; } @@ -261,7 +262,7 @@ public class WikiBoard { */ public String reason() { final String ret; - final String r = record.get("reason"); + final String r = this.record.get("reason"); if (r != null) { final byte[] b; ret = ((b = Base64Order.enhancedCoder.decode(r)) != null) ? UTF8.String(b) : "unknown"; @@ -276,7 +277,7 @@ public class WikiBoard { * @return content of Entry. */ public byte[] page() { - final String m = record.get("page"); + final String m = this.record.get("page"); final byte[] b; return (m != null && (b = Base64Order.enhancedCoder.decode(m)) != null) ? b : new byte[0]; } @@ -286,7 +287,7 @@ public class WikiBoard { * @param date date of previous version of Entry. 
*/ void setAncestorDate(final Date date) { - record.put("bkp", dateString(date)); + this.record.put("bkp", dateString(date)); } /** @@ -296,7 +297,7 @@ public class WikiBoard { private Date getAncestorDate() { Date ret = null; try { - final String c = record.get("date"); + final String c = this.record.get("date"); if (c != null) { synchronized (SimpleFormatter) { ret = SimpleFormatter.parse(c); @@ -314,7 +315,7 @@ public class WikiBoard { */ public Entry getAncestor() { final Date ancDate = getAncestorDate(); - return (ancDate == null) ? null : read(key + dateString(ancDate), bkpbase); + return (ancDate == null) ? null : read(this.key + dateString(ancDate), WikiBoard.this.bkpbase); } /** @@ -322,7 +323,7 @@ public class WikiBoard { * @param subject subject of child of current Entry. */ void setChild(final String subject) { - record.put("child", Base64Order.enhancedCoder.encode(UTF8.getBytes(subject))); + this.record.put("child", Base64Order.enhancedCoder.encode(UTF8.getBytes(subject))); } /** @@ -330,7 +331,7 @@ public class WikiBoard { * @return name of child of this Entry. */ private String getChildName() { - final String c = record.get("child"); + final String c = this.record.get("child"); final byte[] subject; return (c != null && (subject = Base64Order.enhancedCoder.decode(c)) != null) ? ASCII.String(subject) : null; } @@ -340,7 +341,7 @@ public class WikiBoard { * @return true if has child, else false. */ public boolean hasChild() { - final String c = record.get("child"); + final String c = this.record.get("child"); return (c != null && Base64Order.enhancedCoder.decode(c) != null) ? true : false; } @@ -350,7 +351,7 @@ public class WikiBoard { */ public Entry getChild() { final String childName = getChildName(); - return (childName == null) ? null : read(childName, datbase); + return (childName == null) ? 
null : read(childName, WikiBoard.this.datbase); } } @@ -370,9 +371,9 @@ public class WikiBoard { entry.setAncestorDate(oldDate); oldEntry.setChild(entry.subject()); // write the backup - bkpbase.insert(UTF8.getBytes(entry.key + dateString(oldDate)), oldEntry.record); + this.bkpbase.insert(UTF8.getBytes(entry.key + dateString(oldDate)), oldEntry.record); // write the new page - datbase.insert(UTF8.getBytes(entry.key), entry.record); + this.datbase.insert(UTF8.getBytes(entry.key), entry.record); key = entry.key; } catch (final Exception e) { Log.logException(e); @@ -386,7 +387,7 @@ public class WikiBoard { * @return Entry which contains data. */ public Entry read(final String key) { - return read(key, datbase); + return read(key, this.datbase); } /** @@ -403,7 +404,7 @@ public class WikiBoard { copyOfKey = copyOfKey.substring(0, keyLength); } final Map record = base.get(UTF8.getBytes(copyOfKey)); - ret = (record == null) ? newEntry(copyOfKey, ANONYMOUS, "127.0.0.1", "New Page", UTF8.getBytes("")) : new Entry(copyOfKey, record); + ret = (record == null) ? newEntry(copyOfKey, ANONYMOUS, Domains.LOCALHOST, "New Page", UTF8.getBytes("")) : new Entry(copyOfKey, record); } catch (final IOException e) { Log.logException(e); } catch (RowSpaceExceededException e) { @@ -418,7 +419,7 @@ public class WikiBoard { * @return the Entry. 
*/ public Entry readBkp(final String key) { - return read(key, bkpbase); + return read(key, this.bkpbase); } /** @@ -428,7 +429,7 @@ public class WikiBoard { * @throws IOException */ public Iterator keys(final boolean up) throws IOException { - return datbase.keys(up, false); + return this.datbase.keys(up, false); } /** @@ -438,6 +439,6 @@ public class WikiBoard { * @throws IOException */ public Iterator keysBkp(final boolean up) throws IOException { - return bkpbase.keys(up, false); + return this.bkpbase.keys(up, false); } } diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java index f0961effb..5c40e5b7e 100644 --- a/source/de/anomic/http/server/HTTPDFileHandler.java +++ b/source/de/anomic/http/server/HTTPDFileHandler.java @@ -227,7 +227,7 @@ public final class HTTPDFileHandler { } private static final ResponseHeader getDefaultHeaders(final String path) { - final ResponseHeader headers = new ResponseHeader(); + final ResponseHeader headers = new ResponseHeader(200); String ext; int pos; if ((pos = path.lastIndexOf('.')) < 0) { @@ -526,7 +526,7 @@ public final class HTTPDFileHandler { aBuffer.append(" \n\n\n"); // write the list to the client - HTTPDemon.sendRespondHeader(conProp, out, httpVersion, 200, null, "text/html; charset=UTF-8", aBuffer.length(), new Date(targetFile.lastModified()), null, new ResponseHeader(), null, null, true); + HTTPDemon.sendRespondHeader(conProp, out, httpVersion, 200, null, "text/html; charset=UTF-8", aBuffer.length(), new Date(targetFile.lastModified()), null, new ResponseHeader(200), null, null, true); if (!method.equals(HeaderFramework.METHOD_HEAD)) { out.write(UTF8.getBytes(aBuffer.toString())); } @@ -546,7 +546,7 @@ public final class HTTPDFileHandler { // implement proxy via url (not in servlet, because we need binary access on ouputStream) if (path.equals("/proxy.html")) { - final List urlProxyAccess = Domains.makePatterns(sb.getConfig("proxyURL.access", 
"127.0.0.1")); + final List urlProxyAccess = Domains.makePatterns(sb.getConfig("proxyURL.access", Domains.LOCALHOST)); final UserDB.Entry user = sb.userDB.getUser(requestHeader); final boolean user_may_see_proxyurl = Domains.matchesList(clientIP, urlProxyAccess) || (user!=null && user.hasRight(UserDB.AccessRight.PROXY_RIGHT)); if (sb.getConfigBool("proxyURL", false) && user_may_see_proxyurl) { @@ -1075,9 +1075,10 @@ public final class HTTPDFileHandler { // apply templates TemplateEngine.writeTemplate(fis, o, templatePatterns, UNRESOLVED_PATTERN); fis.close(); + ResponseHeader rh = (templatePatterns == null) ? new ResponseHeader(200) : templatePatterns.getOutgoingHeader(); HTTPDemon.sendRespondHeader(conProp, out, - httpVersion, 200, null, mimeType, -1, - targetDate, expireDate, (templatePatterns == null) ? new ResponseHeader() : templatePatterns.getOutgoingHeader(), + httpVersion, rh.getStatusCode(), null, mimeType, -1, + targetDate, expireDate, rh, null, "chunked", nocache); // send the content in chunked parts, see RFC 2616 section 3.6.1 final ChunkedOutputStream chos = new ChunkedOutputStream(out); @@ -1107,16 +1108,17 @@ public final class HTTPDFileHandler { ServerSideIncludes.writeSSI(o1, o, realmProp, clientIP, requestHeader); //httpTemplate.writeTemplate(fis, o, tp, "-UNRESOLVED_PATTERN-".getBytes("UTF-8")); } + ResponseHeader rh = (templatePatterns == null) ? new ResponseHeader(200) : templatePatterns.getOutgoingHeader(); if (method.equals(HeaderFramework.METHOD_HEAD)) { HTTPDemon.sendRespondHeader(conProp, out, - httpVersion, 200, null, mimeType, o.length(), - targetDate, expireDate, (templatePatterns == null) ? new ResponseHeader() : templatePatterns.getOutgoingHeader(), + httpVersion, rh.getStatusCode(), null, mimeType, o.length(), + targetDate, expireDate, rh, contentEncoding, null, nocache); } else { final byte[] result = o.getBytes(); // this interrupts streaming (bad idea!) 
HTTPDemon.sendRespondHeader(conProp, out, - httpVersion, 200, null, mimeType, result.length, - targetDate, expireDate, (templatePatterns == null) ? new ResponseHeader() : templatePatterns.getOutgoingHeader(), + httpVersion, rh.getStatusCode(), null, mimeType, result.length, + targetDate, expireDate, rh, contentEncoding, null, nocache); FileUtils.copy(result, out); } @@ -1125,7 +1127,7 @@ public final class HTTPDFileHandler { int statusCode = 200; int rangeStartOffset = 0; - final ResponseHeader header = new ResponseHeader(); + final ResponseHeader header = new ResponseHeader(statusCode); // adding the accept ranges header header.put(HeaderFramework.ACCEPT_RANGES, "bytes"); @@ -1429,8 +1431,8 @@ public final class HTTPDFileHandler { String strARGS = (String) conProp.get("ARGS"); if(strARGS.startsWith("action=")) { int detectnextargument = strARGS.indexOf("&"); - action = strARGS.substring (7, detectnextargument); - strARGS = strARGS.substring(detectnextargument+1); + action = strARGS.substring (7, detectnextargument); + strARGS = strARGS.substring(detectnextargument+1); } if(strARGS.startsWith("url=")) { final String strUrl = strARGS.substring(4); // strip url= @@ -1467,7 +1469,7 @@ public final class HTTPDFileHandler { requestHeader.remove("Authorization"); requestHeader.remove("Connection"); requestHeader.put(HeaderFramework.HOST, proxyurl.getHost()); - + // temporarily add argument to header to pass it on to augmented browsing requestHeader.put("YACYACTION", action); @@ -1475,7 +1477,7 @@ public final class HTTPDFileHandler { HTTPDProxyHandler.doGet(prop, requestHeader, o); // reparse header to extract content-length and mimetype - final ResponseHeader outgoingHeader = new ResponseHeader(); + final ResponseHeader outgoingHeader = new ResponseHeader(200); final InputStream in = new ByteArrayInputStream(o.toByteArray()); String line = readLine(in); while(line != null && !line.equals("")) { diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java 
b/source/de/anomic/http/server/HTTPDProxyHandler.java index 99c04062f..137fab708 100644 --- a/source/de/anomic/http/server/HTTPDProxyHandler.java +++ b/source/de/anomic/http/server/HTTPDProxyHandler.java @@ -403,7 +403,6 @@ public final class HTTPDProxyHandler { request, requestHeader, cachedResponseHeader, - "200 OK", sb.crawler.defaultProxyProfile, true ); @@ -495,20 +494,20 @@ public final class HTTPDProxyHandler { if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine()); conProp.put(HeaderFramework.CONNECTION_PROP_CLIENT_REQUEST_HEADER, requestHeader); - final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); // determine if it's an internal error of the httpc if (responseHeader.isEmpty()) { throw new Exception(client.getHttpResponse().getStatusLine().toString()); } - if(AugmentedHtmlStream.supportsMime(responseHeader.mime())) { + if (AugmentedHtmlStream.supportsMime(responseHeader.mime())) { // enable chunk encoding, because we don't know the length after annotating responseHeader.remove(HeaderFramework.CONTENT_LENGTH); responseHeader.put(HeaderFramework.TRANSFER_ENCODING, "chunked"); - } - ChunkedOutputStream chunkedOut = setTransferEncoding(conProp, responseHeader, client.getHttpResponse().getStatusLine().getStatusCode(), respond); + ChunkedOutputStream chunkedOut = setTransferEncoding(conProp, responseHeader, statusCode, respond); // the cache does either not exist or is (supposed to be) stale long sizeBeforeDelete = -1; @@ -558,7 +557,7 @@ public final class HTTPDProxyHandler { conProp, respond, httpVer, - client.getHttpResponse().getStatusLine().getStatusCode(), + statusCode, client.getHttpResponse().getStatusLine().toString(), // status text responseHeader); @@ -569,7 +568,6 
@@ public final class HTTPDProxyHandler { request, requestHeader, responseHeader, - Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()), sb.crawler.defaultProxyProfile, true ); @@ -845,7 +843,8 @@ public final class HTTPDProxyHandler { // if (responseHeader.isEmpty()) { // throw new Exception(res.getStatusLine()); // } - final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); if (responseHeader.isEmpty()) { throw new Exception(client.getHttpResponse().getStatusLine().toString()); } @@ -860,7 +859,7 @@ public final class HTTPDProxyHandler { conProp, respond, httpVer, - client.getHttpResponse().getStatusLine().getStatusCode(), + statusCode, client.getHttpResponse().getStatusLine().toString(), responseHeader); respond.flush(); @@ -951,7 +950,8 @@ public final class HTTPDProxyHandler { client.POST(getUrl, body, contentLength); if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine()); - final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); // determine if it's an internal error of the httpc if (responseHeader.isEmpty()) { throw new Exception(client.getHttpResponse().getStatusLine().toString()); @@ -971,7 +971,7 @@ public final class HTTPDProxyHandler { HTTPDemon.sendRespondHeader(conProp, countedRespond, httpVer, - client.getHttpResponse().getStatusLine().getStatusCode(), + statusCode, client.getHttpResponse().getStatusLine().toString(), // status text responseHeader); @@ -1249,11 +1249,12 @@ public final class HTTPDProxyHandler { try { 
remoteProxy.HEADResponse("http://" + host + ":" + port); - final ResponseHeader header = new ResponseHeader(remoteProxy.getHttpResponse().getAllHeaders()); + int statusCode = remoteProxy.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader header = new ResponseHeader(statusCode, remoteProxy.getHttpResponse().getAllHeaders()); // outputs a logline to the serverlog with the current status log.logInfo("CONNECT-RESPONSE: status=" + remoteProxy.getHttpResponse().getStatusLine() + ", header=" + header.toString()); - final boolean success = remoteProxy.getHttpResponse().getStatusLine().getStatusCode() >= 200 && remoteProxy.getHttpResponse().getStatusLine().getStatusCode() <= 399; + final boolean success = statusCode >= 200 && statusCode <= 399; if (success) { // replace connection details host = ProxySettings.host; diff --git a/source/de/anomic/http/server/HTTPDemon.java b/source/de/anomic/http/server/HTTPDemon.java index b3d0a598d..6dc088eff 100644 --- a/source/de/anomic/http/server/HTTPDemon.java +++ b/source/de/anomic/http/server/HTTPDemon.java @@ -1071,7 +1071,7 @@ public final class HTTPDemon implements serverHandler, Cloneable { // set rewrite values final serverObjects tp = new serverObjects(); - String clientIP = (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); if (clientIP == null) clientIP = "127.0.0.1"; + String clientIP = (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); if (clientIP == null) clientIP = Domains.LOCALHOST; // check if ip is local ip address final InetAddress hostAddress = Domains.dnsResolve(clientIP); @@ -1136,9 +1136,7 @@ public final class HTTPDemon implements serverHandler, Cloneable { final byte[] result = o.toByteArray(); o.close(); o = null; - if(header == null) { - header = new ResponseHeader(); - } + if (header == null) header = new ResponseHeader(httpStatusCode); header.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_STATUS, Integer.toString(httpStatusCode)); 
header.put(HeaderFramework.DATE, systemDate); header.put(HeaderFramework.CONTENT_TYPE, "text/html"); @@ -1189,9 +1187,7 @@ public final class HTTPDemon implements serverHandler, Cloneable { } } - if (headers == null) { - headers = new ResponseHeader(); - } + if (headers == null) headers = new ResponseHeader(httpStatusCode); final Date now = new Date(System.currentTimeMillis()); headers.put(HeaderFramework.SERVER, "AnomicHTTPD (www.anomic.de)"); @@ -1240,7 +1236,7 @@ public final class HTTPDemon implements serverHandler, Cloneable { if (respond == null) throw new NullPointerException("The outputstream must not be null."); if (conProp == null) throw new NullPointerException("The connection property structure must not be null."); if (httpVersion == null) httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); if (httpVersion == null) httpVersion = HeaderFramework.HTTP_VERSION_1_1; - if (responseHeader == null) responseHeader = new ResponseHeader(); + if (responseHeader == null) responseHeader = new ResponseHeader(httpStatusCode); try { if ((httpStatusText == null)||(httpStatusText.length()==0)) { diff --git a/source/de/anomic/server/serverCore.java b/source/de/anomic/server/serverCore.java index 75617fc29..a9b8c8b96 100644 --- a/source/de/anomic/server/serverCore.java +++ b/source/de/anomic/server/serverCore.java @@ -169,9 +169,9 @@ public final class serverCore extends AbstractBusyThread implements BusyThread { public static String clientAddress(final Socket s) { final InetAddress uAddr = s.getInetAddress(); - if (uAddr.isAnyLocalAddress()) return "127.0.0.1"; + if (uAddr.isAnyLocalAddress()) return Domains.LOCALHOST; String cIP = uAddr.getHostAddress(); - if (Domains.isLocal(cIP, null)) cIP = "127.0.0.1"; + if (Domains.isLocal(cIP, null)) cIP = Domains.LOCALHOST; return cIP; } @@ -966,6 +966,7 @@ public final class serverCore extends AbstractBusyThread implements BusyThread { public Restarter(final int delay) { this.delay = delay; } + @Override 
public void run() { // waiting for a while try { diff --git a/source/de/anomic/server/servletProperties.java b/source/de/anomic/server/servletProperties.java index 7bf73a697..86fa7df31 100644 --- a/source/de/anomic/server/servletProperties.java +++ b/source/de/anomic/server/servletProperties.java @@ -26,7 +26,7 @@ import net.yacy.cora.protocol.ResponseHeader; public class servletProperties extends serverObjects { private static final long serialVersionUID = 1L; - + public static final String PEER_STAT_VERSION = "version"; public static final String PEER_STAT_UPTIME = "uptime"; public static final String PEER_STAT_MYTIME = "mytime"; @@ -34,56 +34,62 @@ public class servletProperties extends serverObjects { public static final String PEER_STAT_CLIENTID = "clientid"; private String prefix=""; - + private ResponseHeader outgoingHeader; - + public servletProperties(){ super(); } - + public servletProperties(final serverObjects so) { super(so); } - + public void setOutgoingHeader(final ResponseHeader outgoingHeader) { this.outgoingHeader = outgoingHeader; } - + public ResponseHeader getOutgoingHeader() { - if(outgoingHeader == null) - return new ResponseHeader(); - return outgoingHeader; + if (this.outgoingHeader == null) return new ResponseHeader(200); + return this.outgoingHeader; } - + public void setPrefix(final String myprefix) { - prefix=myprefix; + this.prefix=myprefix; } - + + @Override public String put(final String key, final byte[] value) { - return super.put(prefix + key, value); + return super.put(this.prefix + key, value); } - + + @Override public long put(final String key, final long value) { - return super.put(prefix + key, value); + return super.put(this.prefix + key, value); } - + + @Override public long inc(final String key) { - return super.inc(prefix+key); + return super.inc(this.prefix+key); } - + + @Override public Object get(final String key, final Object dflt) { - return super.get(prefix+key, dflt); + return super.get(this.prefix+key, dflt); } - 
+ + @Override public String get(final String key, final String dflt) { - return super.get(prefix+key, dflt); + return super.get(this.prefix+key, dflt); } - + + @Override public int getInt(final String key, final int dflt) { - return super.getInt(prefix+key, dflt); + return super.getInt(this.prefix+key, dflt); } - + + @Override public long getLong(final String key, final long dflt) { - return super.getLong(prefix+key, dflt); + return super.getLong(this.prefix+key, dflt); } } diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 05fc2580b..24592ee2d 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -349,8 +349,9 @@ public class MultiProtocolURI implements Serializable, Comparable= 0 && host.charAt(0) != '[') host = '[' + host + ']'; // IPv6 host must be enclosed in square brackets this.protocol = protocol; this.host = host; this.port = port; @@ -709,7 +710,7 @@ public class MultiProtocolURI implements Serializable, Comparable InetAddressLocatorClass; private static Method InetAddressLocatorGetLocaleInetAddressMethod; @@ -553,14 +562,16 @@ public class Domains { cacheHit_Insert++; } + final private static TimeLimiter timeLimiter = new SimpleTimeLimiter(Executors.newFixedThreadPool(20)); + /** * resolve a host address using a local DNS cache and a DNS lookup if necessary * @param host * @return the hosts InetAddress or null if the address cannot be resolved */ - public static InetAddress dnsResolve(String host) { - if ((host == null) || (host.length() == 0)) return null; - host = host.toLowerCase().trim(); + public static InetAddress dnsResolve(final String host0) { + if (host0 == null || host0.length() == 0) return null; + final String host = host0.toLowerCase().trim(); // try to simply parse the address InetAddress ip = parseInetAddress(host); if (ip != null) return ip; @@ -615,8 +626,23 @@ public class Domains { try { 
//final long t = System.currentTimeMillis(); Thread.currentThread().setName("Domains: DNS resolve of '" + host + "'"); // thread dump show which host is resolved - ip = TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone - //ip = InetAddress.getByName(host); // this makes the DNS request to backbone + if (InetAddresses.isInetAddress(host)) { + try { + ip = InetAddresses.forString(host); + Log.logInfo("Domains", "using guava for host resolution:" + host); + } catch (IllegalArgumentException e) { + ip = null; + } + } + if (ip == null) { + ip = timeLimiter.callWithTimeout(new Callable() { + @Override + public InetAddress call() throws Exception { + return InetAddress.getByName(host); + } + }, 1000L, TimeUnit.MILLISECONDS, false); + //ip = TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone + } //.out.println("DNSLOOKUP-*LOOKUP* " + host + ", time = " + (System.currentTimeMillis() - t) + "ms"); } catch (final Throwable e) { // add new entries @@ -663,7 +689,9 @@ public class Domains { public static final InetAddress parseInetAddress(String ip) { if (ip == null || ip.length() < 8) return null; - if (isLocalhost(ip)) ip = "127.0.0.1"; + ip = ip.trim(); + if (ip.charAt(0) == '[' && ip.charAt(ip.length() - 1) == ']') ip = ip.substring(1, ip.length() - 1); + if (isLocalhost(ip)) ip = "127.0.0.1"; // normalize to IPv4 here since that is the way to calculate the InetAddress final String[] ips = dotPattern.split(ip); if (ips.length != 4) return null; final byte[] ipb = new byte[4]; @@ -699,7 +727,6 @@ public class Domains { return nameCacheNoCachingPatterns.size(); } - private static String localHostName = "127.0.0.1"; private static Set localHostAddresses = new HashSet(); private static Set localHostNames = new HashSet(); static { @@ -708,7 +735,7 @@ public class Domains { if (localHostAddress != null) localHostAddresses.add(localHostAddress); } catch (final UnknownHostException e) {} try { - final InetAddress[] 
moreAddresses = InetAddress.getAllByName(localHostName); + final InetAddress[] moreAddresses = InetAddress.getAllByName(LOCALHOST_NAME); if (moreAddresses != null) localHostAddresses.addAll(Arrays.asList(moreAddresses)); } catch (final UnknownHostException e) {} @@ -735,13 +762,13 @@ public class Domains { // now look up the host name try { - localHostName = getHostName(InetAddress.getLocalHost()); + LOCALHOST_NAME = getHostName(InetAddress.getLocalHost()); } catch (final UnknownHostException e) {} // after the host name was resolved, we try to look up more local addresses // using the host name: try { - final InetAddress[] moreAddresses = InetAddress.getAllByName(localHostName); + final InetAddress[] moreAddresses = InetAddress.getAllByName(LOCALHOST_NAME); if (moreAddresses != null) localHostAddresses.addAll(Arrays.asList(moreAddresses)); } catch (final UnknownHostException e) { } diff --git a/source/net/yacy/cora/protocol/HeaderFramework.java b/source/net/yacy/cora/protocol/HeaderFramework.java index e7215f1e5..c8a593ecb 100644 --- a/source/net/yacy/cora/protocol/HeaderFramework.java +++ b/source/net/yacy/cora/protocol/HeaderFramework.java @@ -82,7 +82,6 @@ public class HeaderFramework extends TreeMap implements Map implements Map reverseMappingCache) { + public ResponseHeader(final int statusCode, final HashMap reverseMappingCache) { super(reverseMappingCache); + this.put(HeaderFramework.STATUS_CODE, Integer.toString(statusCode)); } public ResponseHeader(final HashMap reverseMappingCache, final Map othermap) { super(reverseMappingCache, othermap); } + public int getStatusCode() { + String statuscode = this.get(HeaderFramework.STATUS_CODE); + if (statuscode == null) return 200; + try { + return Integer.parseInt(statuscode); + } catch (NumberFormatException e) { + return 200; + } + } + public Date date() { final Date d = headerDate(HeaderFramework.DATE); if (d == null) return new Date(); else return d; diff --git 
a/source/net/yacy/cora/protocol/TimeoutRequest.java b/source/net/yacy/cora/protocol/TimeoutRequest.java index fed518a13..071759c19 100644 --- a/source/net/yacy/cora/protocol/TimeoutRequest.java +++ b/source/net/yacy/cora/protocol/TimeoutRequest.java @@ -70,6 +70,7 @@ public class TimeoutRequest { try { final Future taskFuture = service.submit(this.call); final Runnable t = new Runnable() { + @Override public void run() { taskFuture.cancel(true); } }; service.execute(t); @@ -109,6 +110,7 @@ public class TimeoutRequest { */ public static boolean ping(final String host, final int port, final int timeout) throws ExecutionException { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { //long time = System.currentTimeMillis(); try { @@ -133,25 +135,6 @@ public class TimeoutRequest { }).call(timeout).booleanValue(); } - /** - * do a DNS lookup within a given time - * @param host - * @param timeout - * @return the InetAddress for a given domain name - * @throws ExecutionException - */ - public static InetAddress getByName(final String host, final long timeout) throws ExecutionException { - return new TimeoutRequest(new Callable() { - public InetAddress call() { - try { - return InetAddress.getByName(host); - } catch (final UnknownHostException e) { - return null; - } - } - }).call(timeout); - } - /** * perform a reverse domain name lookup for a given InetAddress within a given timeout * @param i @@ -161,6 +144,7 @@ public class TimeoutRequest { */ public static String getHostName(final InetAddress i, final long timeout) throws ExecutionException { return new TimeoutRequest(new Callable() { + @Override public String call() { return i.getHostName(); } }).call(timeout); } @@ -175,6 +159,7 @@ public class TimeoutRequest { public static boolean exists(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { try { return file.exists(); } catch (final SmbException 
e) { @@ -196,6 +181,7 @@ public class TimeoutRequest { public static boolean canRead(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { try { return file.canRead(); } catch (final SmbException e) { @@ -217,6 +203,7 @@ public class TimeoutRequest { public static boolean canWrite(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { try { return file.canWrite(); } catch (final SmbException e) { @@ -238,6 +225,7 @@ public class TimeoutRequest { public static boolean isHidden(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { try { return file.isHidden(); } catch (final SmbException e) { @@ -259,6 +247,7 @@ public class TimeoutRequest { public static boolean isDirectory(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { try { return file.isDirectory(); } catch (final SmbException e) { @@ -280,6 +269,7 @@ public class TimeoutRequest { public static long length(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Long call() { try { return file.length(); } catch (final SmbException e) { @@ -301,6 +291,7 @@ public class TimeoutRequest { public static long lastModified(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Long call() { try { return file.lastModified(); } catch (final SmbException e) { @@ -322,6 +313,7 @@ public class TimeoutRequest { public static String[] list(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public String[] call() { try { return file.list(); } catch 
(final SmbException e) { @@ -334,11 +326,4 @@ public class TimeoutRequest { } } - public static void main(final String[] args) { - try { - System.out.println(getByName("yacy.net", 100)); - } catch (final ExecutionException e) { - e.printStackTrace(); - } - } } diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java index d5bfd252a..be022b872 100644 --- a/source/net/yacy/cora/protocol/http/HTTPClient.java +++ b/source/net/yacy/cora/protocol/http/HTTPClient.java @@ -47,6 +47,7 @@ import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ConnectionInfo; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import org.apache.http.Header; @@ -156,7 +157,7 @@ public class HTTPClient { // connections per host (2 default) clientConnectionManager.setDefaultMaxPerRoute(2); // Increase max connections for localhost - final HttpHost localhost = new HttpHost("localhost"); + final HttpHost localhost = new HttpHost(Domains.LOCALHOST); clientConnectionManager.setMaxPerRoute(new HttpRoute(localhost), maxcon); /** * HTTP protocol settings @@ -339,7 +340,7 @@ public class HTTPClient { * @throws IOException */ public byte[] GETbytes(final MultiProtocolURI url, final int maxBytes) throws IOException { - final boolean localhost = url.getHost().equals("localhost"); + final boolean localhost = Domains.isLocalhost(url.getHost()); final String urix = url.toNormalform(true, false); final HttpGet httpGet = new HttpGet(urix); if (!localhost) setHost(url.getHost()); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service @@ -395,7 +396,7 @@ public class HTTPClient { final MultiProtocolURI url = new MultiProtocolURI(uri); final HttpPost httpPost = new HttpPost(url.toNormalform(true, false)); String host = url.getHost(); - 
if (host == null) host = "127.0.0.1"; + if (host == null) host = Domains.LOCALHOST; setHost(host); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service final NonClosingInputStreamEntity inputStreamEntity = new NonClosingInputStreamEntity(instream, length); // statistics @@ -432,7 +433,7 @@ public class HTTPClient { final HttpPost httpPost = new HttpPost(url.toNormalform(true, false)); setHost(vhost); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service - if (vhost == null) setHost("127.0.0.1"); + if (vhost == null) setHost(Domains.LOCALHOST); final MultipartEntity multipartEntity = new MultipartEntity(); for (final Entry part : post.entrySet()) @@ -462,7 +463,7 @@ public class HTTPClient { final MultiProtocolURI url = new MultiProtocolURI(uri); final HttpPost httpPost = new HttpPost(url.toNormalform(true, false)); String host = url.getHost(); - if (host == null) host = "127.0.0.1"; + if (host == null) host = Domains.LOCALHOST; setHost(host); // overwrite resolved IP, needed for shared web hosting DO NOT REMOVE, see http://en.wikipedia.org/wiki/Shared_web_hosting_service final InputStreamEntity inputStreamEntity = new InputStreamEntity(instream, length); @@ -708,17 +709,18 @@ public class HTTPClient { final SSLSocketFactory sslSF = new SSLSocketFactory(sslContext, SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER); return sslSF; } - + /** * If the Keep-Alive header is not present in the response, * HttpClient assumes the connection can be kept alive indefinitely. - * Here we limit this to 5 seconds. - * + * Here we limit this to 5 seconds. 
+ * * @param defaultHttpClient */ private static void addCustomKeepAliveStrategy(final DefaultHttpClient defaultHttpClient) { defaultHttpClient.setKeepAliveStrategy(new ConnectionKeepAliveStrategy() { - public long getKeepAliveDuration(HttpResponse response, HttpContext context) { + @Override + public long getKeepAliveDuration(HttpResponse response, HttpContext context) { // Honor 'keep-alive' header String param, value; HeaderElement element; @@ -726,7 +728,7 @@ public class HTTPClient { response.headerIterator(HTTP.CONN_KEEP_ALIVE)); while (it.hasNext()) { element = it.nextElement(); - param = element.getName(); + param = element.getName(); value = element.getValue(); if (value != null && param.equalsIgnoreCase("timeout")) { try { diff --git a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java b/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java similarity index 50% rename from source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java rename to source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java index 876c60a64..981eda488 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java @@ -1,11 +1,7 @@ /** - * SolrSingleConnector - * Copyright 2011 by Michael Peter Christen - * First released 14.04.2011 at http://yacy.net - * - * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $ - * $LastChangedRevision: 7654 $ - * $LastChangedBy: orbiter $ + * AbstractSolrConnector + * Copyright 2012 by Michael Peter Christen + * First released 21.06.2012 at http://yacy.net * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -26,101 +22,67 @@ package net.yacy.cora.services.federated.solr; import java.io.File; import java.io.IOException; -import java.net.InetAddress; import java.util.ArrayList; import java.util.Collection; 
import java.util.List; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.protocol.Domains; import net.yacy.kelondro.logging.Log; +import net.yacy.search.index.SolrField; -import org.apache.http.HttpHost; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.AuthCache; -import org.apache.http.client.protocol.ClientContext; -import org.apache.http.impl.auth.BasicScheme; -import org.apache.http.impl.client.BasicAuthCache; -import org.apache.http.impl.client.BasicCredentialsProvider; -import org.apache.http.impl.client.DefaultHttpClient; -import org.apache.http.protocol.HttpContext; import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.client.solrj.impl.HttpSolrServer; import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; -import net.yacy.search.index.SolrField; +public class AbstractSolrConnector implements SolrConnector { -public class SolrSingleConnector implements SolrConnector { + protected SolrServer server; + protected int commitWithinMs; // max time (in ms) before a commit will happen - private final String solrurl, host, solrpath, solraccount, solrpw; - private final int port; - private HttpSolrServer server; + protected AbstractSolrConnector() { + this.server = null; + this.commitWithinMs = 180000; + } + + protected void init(SolrServer server) { + this.server = server; + } + + public SolrServer getServer() { + return this.server; + } /** - * create a new solr connector - * @param url the solr url, like http://192.168.1.60:8983/solr/ or http://admin:pw@192.168.1.60:8983/solr/ - * @param scheme - * @throws IOException + * get the 
solr autocommit delay + * @return the maximum waiting time after a solr command until it is transported to the server */ - public SolrSingleConnector(final String url) throws IOException { - this.solrurl = url; + @Override + public int getCommitWithinMs() { + return this.commitWithinMs; + } - // connect using authentication - final MultiProtocolURI u = new MultiProtocolURI(this.solrurl); - this.host = u.getHost(); - this.port = u.getPort(); - this.solrpath = u.getPath(); - final String userinfo = u.getUserInfo(); - if (userinfo == null || userinfo.length() == 0) { - this.solraccount = ""; this.solrpw = ""; - } else { - final int p = userinfo.indexOf(':'); - if (p < 0) { - this.solraccount = userinfo; this.solrpw = ""; - } else { - this.solraccount = userinfo.substring(0, p); this.solrpw = userinfo.substring(p + 1); - } - } - if (this.solraccount.length() > 0) { - final DefaultHttpClient client = new DefaultHttpClient() { - @Override - protected HttpContext createHttpContext() { - HttpContext context = super.createHttpContext(); - AuthCache authCache = new BasicAuthCache(); - BasicScheme basicAuth = new BasicScheme(); - HttpHost targetHost = new HttpHost(u.getHost(), u.getPort(), u.getProtocol()); - authCache.put(targetHost, basicAuth); - context.setAttribute(ClientContext.AUTH_CACHE, authCache); - return context; - } - }; - BasicCredentialsProvider credsProvider = new BasicCredentialsProvider(); - credsProvider.setCredentials(new AuthScope(this.host, AuthScope.ANY_PORT), new UsernamePasswordCredentials(this.solraccount, this.solrpw)); - client.setCredentialsProvider(credsProvider); - this.server = new HttpSolrServer("http://" + this.host + ":" + this.port + this.solrpath, client); - } else { - this.server = new HttpSolrServer(this.solrurl); - } - this.server.setAllowCompression(true); - this.server.setConnectionTimeout(60000); - this.server.setMaxRetries(1); // Solr-Doc: No more than 1 recommended (depreciated) - this.server.setSoTimeout(60000); + /** + * set the 
solr autocommit delay + * @param c the maximum waiting time after a solr command until it is transported to the server + */ + @Override + public void setCommitWithinMs(int c) { + this.commitWithinMs = c; } @Override public synchronized void close() { try { this.server.commit(); + this.server = null; } catch (SolrServerException e) { - e.printStackTrace(); + Log.logException(e); } catch (IOException e) { - e.printStackTrace(); + Log.logException(e); } } @@ -196,7 +158,7 @@ public class SolrSingleConnector implements SolrConnector { @Override public void add(final SolrDoc solrdoc) throws IOException, SolrException { try { - this.server.add(solrdoc,180000); // commitWithIn 180s + this.server.add(solrdoc, this.commitWithinMs); //this.server.commit(); } catch (SolrServerException e) { Log.logWarning("SolrConnector", e.getMessage() + " DOC=" + solrdoc.toString()); @@ -209,7 +171,7 @@ public class SolrSingleConnector implements SolrConnector { ArrayList l = new ArrayList(); for (SolrDoc d: solrdocs) l.add(d); try { - this.server.add(l,180000); // commitWithIn 120s + this.server.add(l, this.commitWithinMs); //this.server.commit(); } catch (SolrServerException e) { Log.logWarning("SolrConnector", e.getMessage() + " DOC=" + solrdocs.toString()); @@ -251,37 +213,4 @@ public class SolrSingleConnector implements SolrConnector { //return result; } - - public String getAdminInterface() { - final InetAddress localhostExternAddress = Domains.myPublicLocalIP(); - final String localhostExtern = localhostExternAddress == null ? "127.0.0.1" : localhostExternAddress.getHostAddress(); - String u = this.solrurl; - int p = u.indexOf("localhost",0); if (p < 0) p = u.indexOf("127.0.0.1",0); - if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + 9); - return u + (u.endsWith("/") ? 
"admin/" : "/admin/"); - } - - public static void main(final String args[]) { - SolrSingleConnector solr; - try { - solr = new SolrSingleConnector("http://127.0.0.1:8983/solr"); - solr.clear(); - final File exampleDir = new File("test/parsertest/"); - long t, t0, a = 0; - int c = 0; - System.out.println("push files in " + exampleDir.getAbsolutePath() + " to Solr"); - for (final String s: exampleDir.list()) { - if (s.startsWith(".")) continue; - t = System.currentTimeMillis(); - solr.add(new File(exampleDir, s), s); - t0 = (System.currentTimeMillis() - t); - a += t0; - c++; - System.out.println("pushed file " + s + " to solr, " + t0 + " milliseconds"); - } - System.out.println("pushed " + c + " files in " + a + " milliseconds, " + (a / c) + " milliseconds average; " + (60000 / a * c) + " PPM"); - } catch (final IOException e) { - e.printStackTrace(); - } - } } diff --git a/source/net/yacy/cora/services/federated/solr/SolrMultipleConnector.java b/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java similarity index 76% rename from source/net/yacy/cora/services/federated/solr/SolrMultipleConnector.java rename to source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java index d9b5147a3..73483ce9b 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrMultipleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java @@ -8,18 +8,20 @@ import java.util.concurrent.ArrayBlockingQueue; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; -public class SolrMultipleConnector implements SolrConnector { +public class MultipleSolrConnector implements SolrConnector { private final static SolrDoc POISON_DOC = new SolrDoc(); private final ArrayBlockingQueue queue; private final AddWorker[] worker; private final SolrConnector solr; + private int commitWithinMs; - public SolrMultipleConnector(final String url, int connections) throws IOException { - this.solr = new 
SolrSingleConnector(url); + public MultipleSolrConnector(final String url, int connections) throws IOException { + this.solr = new SingleSolrConnector(url); this.queue = new ArrayBlockingQueue(1000); this.worker = new AddWorker[connections]; + this.commitWithinMs = 180000; for (int i = 0; i < connections; i++) { this.worker[i] = new AddWorker(url); this.worker[i].start(); @@ -29,13 +31,14 @@ public class SolrMultipleConnector implements SolrConnector { private class AddWorker extends Thread { private final SolrConnector solr; public AddWorker(final String url) throws IOException { - this.solr = new SolrSingleConnector(url); + this.solr = new SingleSolrConnector(url); + this.solr.setCommitWithinMs(MultipleSolrConnector.this.commitWithinMs); } @Override public void run() { SolrDoc doc; try { - while ((doc = SolrMultipleConnector.this.queue.take()) != POISON_DOC) { + while ((doc = MultipleSolrConnector.this.queue.take()) != POISON_DOC) { try { this.solr.add(doc); } catch (SolrException e) { @@ -51,6 +54,22 @@ public class SolrMultipleConnector implements SolrConnector { } } + @Override + public int getCommitWithinMs() { + return this.commitWithinMs; + } + + /** + * set the solr autocommit delay + * @param c the maximum waiting time after a solr command until it is transported to the server + */ + @Override + public void setCommitWithinMs(int c) { + this.commitWithinMs = c; + this.solr.setCommitWithinMs(c); + for (AddWorker w: this.worker) w.solr.setCommitWithinMs(c); + } + @Override public void close() { for (@SuppressWarnings("unused") AddWorker element : this.worker) { @@ -59,8 +78,8 @@ public class SolrMultipleConnector implements SolrConnector { } catch (InterruptedException e) { e.printStackTrace(); } - this.solr.close(); } + this.solr.close(); } @Override diff --git a/source/net/yacy/cora/services/federated/solr/SolrRetryConnector.java b/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java similarity index 91% rename from 
source/net/yacy/cora/services/federated/solr/SolrRetryConnector.java rename to source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java index b28134952..f3863732a 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrRetryConnector.java +++ b/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java @@ -31,16 +31,30 @@ import java.util.List; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; -public class SolrRetryConnector implements SolrConnector { +public class RetrySolrConnector implements SolrConnector { private final SolrConnector solrConnector; private final long retryMaxTime; - public SolrRetryConnector(final SolrConnector solrConnector, final long retryMaxTime) { + public RetrySolrConnector(final SolrConnector solrConnector, final long retryMaxTime) { this.solrConnector = solrConnector; this.retryMaxTime = retryMaxTime; } + @Override + public int getCommitWithinMs() { + return this.solrConnector.getCommitWithinMs(); + } + + /** + * set the solr autocommit delay + * @param c the maximum waiting time after a solr command until it is transported to the server + */ + @Override + public void setCommitWithinMs(int c) { + this.solrConnector.setCommitWithinMs(c); + } + @Override public synchronized void close() { this.solrConnector.close(); diff --git a/source/net/yacy/cora/services/federated/solr/SolrShardingSelection.java b/source/net/yacy/cora/services/federated/solr/ShardSelection.java similarity index 96% rename from source/net/yacy/cora/services/federated/solr/SolrShardingSelection.java rename to source/net/yacy/cora/services/federated/solr/ShardSelection.java index ae86a3411..c303a56db 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrShardingSelection.java +++ b/source/net/yacy/cora/services/federated/solr/ShardSelection.java @@ -33,7 +33,7 @@ import java.security.NoSuchAlgorithmException; import java.util.concurrent.atomic.AtomicLong; import 
net.yacy.search.index.SolrField; -public class SolrShardingSelection { +public class ShardSelection { public final static Charset charsetUTF8; static { @@ -47,7 +47,7 @@ public class SolrShardingSelection { MODULO_HOST_MD5, ROUND_ROBIN; } - public SolrShardingSelection(final Method method, final int dimension) { + public ShardSelection(final Method method, final int dimension) { this.method = method; this.dimension = dimension; this.chardID = new AtomicLong(0); diff --git a/source/net/yacy/cora/services/federated/solr/SolrShardingConnector.java b/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java similarity index 82% rename from source/net/yacy/cora/services/federated/solr/SolrShardingConnector.java rename to source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java index 0249165f2..45edd8d04 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrShardingConnector.java +++ b/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java @@ -37,22 +37,36 @@ import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; -public class SolrShardingConnector implements SolrConnector { +public class ShardSolrConnector implements SolrConnector { private final List connectors; - private final SolrShardingSelection sharding; + private final ShardSelection sharding; private final String[] urls; - public SolrShardingConnector(final String urlList, final SolrShardingSelection.Method method, final long timeout, boolean multipleConnections) throws IOException { + public ShardSolrConnector(final String urlList, final ShardSelection.Method method, final long timeout, boolean multipleConnections) throws IOException { urlList.replace(' ', ','); this.urls = urlList.split(","); this.connectors = new ArrayList(); SolrConnector s; for (final String u: this.urls) { - s = multipleConnections ? 
new SolrMultipleConnector(u.trim(), 2) : new SolrSingleConnector(u.trim()); - this.connectors.add(new SolrRetryConnector(s, timeout)); + s = multipleConnections ? new MultipleSolrConnector(u.trim(), 2) : new SingleSolrConnector(u.trim()); + this.connectors.add(new RetrySolrConnector(s, timeout)); } - this.sharding = new SolrShardingSelection(method, this.urls.length); + this.sharding = new ShardSelection(method, this.urls.length); + } + + @Override + public int getCommitWithinMs() { + return this.connectors.get(0).getCommitWithinMs(); + } + + /** + * set the solr autocommit delay + * @param c the maximum waiting time after a solr command until it is transported to the server + */ + @Override + public void setCommitWithinMs(int c) { + for (final SolrConnector connector: this.connectors) connector.setCommitWithinMs(c); } @Override @@ -175,9 +189,11 @@ public class SolrShardingConnector implements SolrConnector { final String[] urlAdmin = new String[this.connectors.size()]; int i = 0; final InetAddress localhostExternAddress = Domains.myPublicLocalIP(); - final String localhostExtern = localhostExternAddress == null ? "127.0.0.1" : localhostExternAddress.getHostAddress(); + final String localhostExtern = localhostExternAddress == null ? Domains.LOCALHOST : localhostExternAddress.getHostAddress(); for (String u: this.urls) { - int p = u.indexOf("localhost",0); if (p < 0) p = u.indexOf("127.0.0.1",0); + int p = u.indexOf("localhost",0); + if (p < 0) p = u.indexOf("127.0.0.1",0); + if (p < 0) p = u.indexOf("0:0:0:0:0:0:0:1",0); if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + 9); urlAdmin[i++] = u + (u.endsWith("/") ? 
"admin/" : "/admin/"); } diff --git a/source/net/yacy/cora/services/federated/solr/SingleSolrConnector.java b/source/net/yacy/cora/services/federated/solr/SingleSolrConnector.java new file mode 100644 index 000000000..28de5b84d --- /dev/null +++ b/source/net/yacy/cora/services/federated/solr/SingleSolrConnector.java @@ -0,0 +1,140 @@ +/** + * SolrSingleConnector + * Copyright 2011 by Michael Peter Christen + * First released 14.04.2011 at http://yacy.net + * + * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $ + * $LastChangedRevision: 7654 $ + * $LastChangedBy: orbiter $ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.cora.services.federated.solr; + +import java.io.File; +import java.io.IOException; +import java.net.InetAddress; + +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.protocol.Domains; + +import org.apache.http.HttpHost; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.AuthCache; +import org.apache.http.client.protocol.ClientContext; +import org.apache.http.impl.auth.BasicScheme; +import org.apache.http.impl.client.BasicAuthCache; +import org.apache.http.impl.client.BasicCredentialsProvider; +import org.apache.http.impl.client.DefaultHttpClient; +import org.apache.http.protocol.HttpContext; +import org.apache.solr.client.solrj.impl.HttpSolrServer; + + +public class SingleSolrConnector extends AbstractSolrConnector implements SolrConnector { + + private final String solrurl, host, solrpath, solraccount, solrpw; + private final int port; + + /** + * create a new solr connector + * @param url the solr url, like http://192.168.1.60:8983/solr/ or http://admin:pw@192.168.1.60:8983/solr/ + * @param scheme + * @throws IOException + */ + public SingleSolrConnector(final String url) throws IOException { + super(); + this.solrurl = url; + + // connect using authentication + final MultiProtocolURI u = new MultiProtocolURI(this.solrurl); + this.host = u.getHost(); + this.port = u.getPort(); + this.solrpath = u.getPath(); + final String userinfo = u.getUserInfo(); + if (userinfo == null || userinfo.length() == 0) { + this.solraccount = ""; this.solrpw = ""; + } else { + final int p = userinfo.indexOf(':'); + if (p < 0) { + this.solraccount = userinfo; this.solrpw = ""; + } else { + this.solraccount = userinfo.substring(0, p); this.solrpw = userinfo.substring(p + 1); + } + } + HttpSolrServer s; + if (this.solraccount.length() > 0) { + final DefaultHttpClient client = new DefaultHttpClient() { + @Override + protected HttpContext createHttpContext() { + 
HttpContext context = super.createHttpContext(); + AuthCache authCache = new BasicAuthCache(); + BasicScheme basicAuth = new BasicScheme(); + HttpHost targetHost = new HttpHost(u.getHost(), u.getPort(), u.getProtocol()); + authCache.put(targetHost, basicAuth); + context.setAttribute(ClientContext.AUTH_CACHE, authCache); + return context; + } + }; + BasicCredentialsProvider credsProvider = new BasicCredentialsProvider(); + credsProvider.setCredentials(new AuthScope(this.host, AuthScope.ANY_PORT), new UsernamePasswordCredentials(this.solraccount, this.solrpw)); + client.setCredentialsProvider(credsProvider); + s = new HttpSolrServer("http://" + this.host + ":" + this.port + this.solrpath, client); + } else { + s = new HttpSolrServer(this.solrurl); + } + s.setAllowCompression(true); + s.setConnectionTimeout(60000); + s.setMaxRetries(1); // Solr-Doc: No more than 1 recommended (depreciated) + s.setSoTimeout(60000); + super.init(s); + } + + public String getAdminInterface() { + final InetAddress localhostExternAddress = Domains.myPublicLocalIP(); + final String localhostExtern = localhostExternAddress == null ? "127.0.0.1" : localhostExternAddress.getHostAddress(); + String u = this.solrurl; + int p = u.indexOf("localhost",0); + if (p < 0) p = u.indexOf("127.0.0.1",0); + if (p < 0) p = u.indexOf("0:0:0:0:0:0:0:1",0); + if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + 9); + return u + (u.endsWith("/") ? 
"admin/" : "/admin/"); + } + + public static void main(final String args[]) { + SingleSolrConnector solr; + try { + solr = new SingleSolrConnector("http://127.0.0.1:8983/solr"); + solr.clear(); + final File exampleDir = new File("test/parsertest/"); + long t, t0, a = 0; + int c = 0; + System.out.println("push files in " + exampleDir.getAbsolutePath() + " to Solr"); + for (final String s: exampleDir.list()) { + if (s.startsWith(".")) continue; + t = System.currentTimeMillis(); + solr.add(new File(exampleDir, s), s); + t0 = (System.currentTimeMillis() - t); + a += t0; + c++; + System.out.println("pushed file " + s + " to solr, " + t0 + " milliseconds"); + } + System.out.println("pushed " + c + " files in " + a + " milliseconds, " + (a / c) + " milliseconds average; " + (60000 / a * c) + " PPM"); + } catch (final IOException e) { + e.printStackTrace(); + } + } +} diff --git a/source/net/yacy/cora/services/federated/solr/SolrConnector.java b/source/net/yacy/cora/services/federated/solr/SolrConnector.java index 3180e2837..a05e38d35 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrConnector.java @@ -33,6 +33,21 @@ import org.apache.solr.common.SolrException; public interface SolrConnector { + /** + * get the solr autocommit delay + * @return the maximum waiting time after a solr command until it is transported to the server + */ + public int getCommitWithinMs(); + + /** + * set the solr autocommit delay + * @param c the maximum waiting time after a solr command until it is transported to the server + */ + public void setCommitWithinMs(int c); + + /** + * close the server connection + */ public void close(); /** diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java deleted file mode 100644 index f58a910d2..000000000 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ /dev/null @@ -1,32 +0,0 
@@ -/** - * SolrScheme - * Copyright 2011 by Michael Peter Christen - * First released 09.05.2012 at http://yacy.net - * - * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $ - * $LastChangedRevision: 7654 $ - * $LastChangedBy: orbiter $ - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . - */ - -package net.yacy.cora.services.federated.solr; - - -public interface SolrScheme { - - public SolrDoc toSolr(); - -} diff --git a/source/net/yacy/cora/storage/ConfigurationSet.java b/source/net/yacy/cora/storage/ConfigurationSet.java index 3277b6328..2e0ac8ef4 100644 --- a/source/net/yacy/cora/storage/ConfigurationSet.java +++ b/source/net/yacy/cora/storage/ConfigurationSet.java @@ -38,7 +38,6 @@ import java.util.logging.Level; import java.util.logging.Logger; import net.yacy.cora.storage.ConfigurationSet.Entry; -import net.yacy.kelondro.util.FileUtils; import net.yacy.search.index.SolrField; /** * this class reads configuration attributes as a list of keywords from a list @@ -199,7 +198,7 @@ public class ConfigurationSet extends TreeMap implements Serializa if (this.file == null) return; // create a temporary bak file, use it as template to preserve user comments File bakfile = new File (this.file.getAbsolutePath() + ".bak"); - FileUtils.copy (this.file, bakfile); + Files.copy (this.file, bakfile); @SuppressWarnings("unchecked") TreeMap tclone 
= (TreeMap) this.clone(); // clone to write appended entries diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 9bfecb957..42fb1f190 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -781,6 +781,7 @@ dc_rights for (final Document doc: docs) { + if (doc == null) continue; final String author = doc.dc_creator(); if (author.length() > 0) { if (authors.length() > 0) authors.append(","); diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index d4a2ba225..da27a2fd5 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -261,7 +261,7 @@ public final class TextParser { if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); try { final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream); - for (final Document d: docs) { assert d.getText() != null; } // verify docs + for (final Document d: docs) { assert d != null && d.getText() != null; } // verify docs return docs; } catch (final Exception e) { throw new Parser.Failure("parser failed: " + parser.getName(), location); diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 386722b59..18f7cbe54 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -625,6 +625,17 @@ public class ContentScraper extends AbstractScraper implements Scraper { return this.li.toArray(new String[this.li.size()]); } + public MultiProtocolURI[] getFlash() { + String ext; + ArrayList f = new ArrayList(); + for (final MultiProtocolURI url: this.anchors.keySet()) { + ext = url.getFileExtension(); + if (ext == null) continue; + if (ext.equals("swf")) f.add(url); + } + return f.toArray(new 
MultiProtocolURI[f.size()]); + } + public boolean containsFlash() { String ext; for (final MultiProtocolURI url: this.anchors.keySet()) { diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaTripleImpl.java b/source/net/yacy/document/parser/rdfa/impl/RDFaTripleImpl.java index 35dc378c5..b1a125f4b 100644 --- a/source/net/yacy/document/parser/rdfa/impl/RDFaTripleImpl.java +++ b/source/net/yacy/document/parser/rdfa/impl/RDFaTripleImpl.java @@ -14,13 +14,12 @@ import javax.xml.transform.TransformerFactory; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; +import net.yacy.yacy; import net.yacy.document.parser.rdfa.IRDFaTriple; import net.yacy.kelondro.logging.Log; -import net.yacy.yacy; - public class RDFaTripleImpl{ - + private static Templates templates = null; private String propertyURI = null; private String subjectURI = null; @@ -30,59 +29,52 @@ public class RDFaTripleImpl{ private String value = null; private String dataType = null; private String language = null; - private Reader in; - private Transformer aTransformer; - private ArrayList allRDFaTriples = new ArrayList(); + private final Reader in; + private final Transformer aTransformer; + private final ArrayList allRDFaTriples = new ArrayList(); public RDFaTripleImpl(Reader in, String base) throws IOException, TransformerException, TransformerConfigurationException { - + BufferedReader bufReader = new BufferedReader(in); String readLine = bufReader.readLine(); if (!readLine.toLowerCase().contains(" to.lastModified()) try { - FileUtils.copy(from, to); + Files.copy(from, to); } catch (final IOException e) {} } } @@ -118,7 +120,7 @@ public class migration { }else{ try { mkdirs(styleFile.getParentFile()); - FileUtils.copy(skinFile, styleFile); + Files.copy(skinFile, styleFile); Log.logInfo("MIGRATION", "copied new Skinfile"); } catch (final IOException e) { Log.logSevere("MIGRATION", "Cannot copy skinfile."); @@ -164,7 +166,7 @@ public class migration { 
sb.wikiDB.close(); file2 = new File(sb.workPath, "wiki.db"); try { - FileUtils.copy(file, file2); + Files.copy(file, file2); file.delete(); } catch (final IOException e) { } @@ -174,7 +176,7 @@ public class migration { Log.logInfo("MIGRATION", "Migrating wiki-bkp.db to "+ sb.workPath); file2 = new File(sb.workPath, "wiki-bkp.db"); try { - FileUtils.copy(file, file2); + Files.copy(file, file2); file.delete(); } catch (final IOException e) {} } @@ -192,7 +194,7 @@ public class migration { sb.messageDB.close(); file2=new File(sb.workPath, "message.db"); try { - FileUtils.copy(file, file2); + Files.copy(file, file2); file.delete(); } catch (final IOException e) {} try { diff --git a/source/net/yacy/peers/Network.java b/source/net/yacy/peers/Network.java index fb99b599b..27e81ff30 100644 --- a/source/net/yacy/peers/Network.java +++ b/source/net/yacy/peers/Network.java @@ -351,6 +351,7 @@ public class Network } } } catch ( final Exception e ) { + Log.logException(e); log.logSevere( "publishThread: error with target seed " + this.seed.toString() + ": " + e.getMessage(), e); diff --git a/source/net/yacy/peers/Seed.java b/source/net/yacy/peers/Seed.java index 1adb3461f..8e5cd8bb2 100644 --- a/source/net/yacy/peers/Seed.java +++ b/source/net/yacy/peers/Seed.java @@ -305,8 +305,8 @@ public class Seed implements Cloneable, Comparable, Comparator * @return the IP or null */ public final String getIP() { - final String ip = get(Seed.IP, "127.0.0.1"); - return (ip == null || ip.length() == 0) ? "127.0.0.1" : ip; + final String ip = get(Seed.IP, Domains.LOCALHOST); + return (ip == null || ip.length() == 0) ? 
Domains.LOCALHOST : ip; } /** @@ -563,7 +563,7 @@ public class Seed implements Cloneable, Comparable, Comparator public final String getPublicAddress() { String ip = getIP(); if ( ip == null || ip.length() < 8 || ip.length() > 60 ) { - ip = "127.0.0.1"; + ip = Domains.LOCALHOST; } final String port = this.dna.get(Seed.PORT); diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java index eb69c65e1..796686dee 100644 --- a/source/net/yacy/peers/operation/yacyRelease.java +++ b/source/net/yacy/peers/operation/yacyRelease.java @@ -52,6 +52,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.services.federated.yacy.CacheStrategy; +import net.yacy.cora.storage.Files; import net.yacy.document.Document; import net.yacy.document.parser.tarParser; import net.yacy.kelondro.data.meta.DigestURI; @@ -308,7 +309,8 @@ public final class yacyRelease extends yacyVersion { } client.setTimout(120000); client.GET(getUrl().toString()); - final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader header = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); final boolean unzipped = header.gzip() && (header.mime().toLowerCase().equals("application/x-tar")); // if true, then the httpc has unzipped the file if (unzipped && name.endsWith(".tar.gz")) { @@ -495,7 +497,7 @@ public final class yacyRelease extends yacyVersion { final File InfoPlistSource = new File(sb.getDataPath(), "DATA/RELEASE/yacy/addon/YaCy.app/Contents/Info.plist"); final File InfoPlistDestination = new File(sb.getAppPath(), "addon/YaCy.app/Contents/Info.plist"); if (InfoPlistSource.exists() && InfoPlistDestination.exists()) { - FileUtils.copy(InfoPlistSource, InfoPlistDestination); + 
Files.copy(InfoPlistSource, InfoPlistDestination); Log.logInfo("UPDATE", "replaced Info.plist"); } } diff --git a/source/net/yacy/peers/operation/yacySeedUploadFile.java b/source/net/yacy/peers/operation/yacySeedUploadFile.java index c7e35d1ca..930ec81d9 100644 --- a/source/net/yacy/peers/operation/yacySeedUploadFile.java +++ b/source/net/yacy/peers/operation/yacySeedUploadFile.java @@ -1,4 +1,4 @@ -//yacySeedUploadFile.java +//yacySeedUploadFile.java //------------------------------------- //part of YACY //(C) by Michael Peter Christen; mc@yacy.net @@ -27,30 +27,32 @@ package net.yacy.peers.operation; import java.io.File; -import net.yacy.kelondro.util.FileUtils; +import com.google.common.io.Files; import de.anomic.server.serverSwitch; public class yacySeedUploadFile implements yacySeedUploader { - + public static final String CONFIG_FILE_PATH = "seedFilePath"; + @Override public String uploadSeedFile(final serverSwitch sb, final File seedFile) throws Exception { - + String seedFilePath = ""; try { seedFilePath = sb.getConfig(CONFIG_FILE_PATH,""); if (seedFilePath.length() == 0) throw new Exception("Path to seed file is not configured properly"); - - final File publicSeedFile = new File(seedFilePath); - FileUtils.copy(seedFile,publicSeedFile); - + + final File publicSeedFile = new File(seedFilePath); + Files.copy(seedFile,publicSeedFile); + return "Seed-List file stored successfully"; } catch (final Exception e) { throw new Exception("Unable to store the seed-list file into the filesystem using path '" + seedFilePath + "'. 
" + e.getMessage()); } } + @Override public String[] getConfigurationOptions() { return new String[]{CONFIG_FILE_PATH}; } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 0b579ba9e..e782c61e7 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -217,7 +217,6 @@ public final class LoaderDispatcher { request, requestHeader, cachedResponse, - "200", crawlProfile, true, content); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 4fd8657cb..92aa5b950 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -93,9 +93,10 @@ import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.TimeoutRequest; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.protocol.http.ProxySettings; +import net.yacy.cora.services.federated.solr.ShardSelection; +import net.yacy.cora.services.federated.solr.ShardSolrConnector; +import net.yacy.cora.services.federated.solr.SolrConnector; import net.yacy.cora.services.federated.solr.SolrDoc; -import net.yacy.cora.services.federated.solr.SolrShardingConnector; -import net.yacy.cora.services.federated.solr.SolrShardingSelection; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Condenser; import net.yacy.document.Document; @@ -151,6 +152,9 @@ import net.yacy.search.query.SearchEvent; import net.yacy.search.query.SearchEventCache; import net.yacy.search.ranking.BlockRank; import net.yacy.search.ranking.RankingProfile; + +import com.google.common.io.Files; + import de.anomic.crawler.Cache; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlQueues; @@ -273,7 +277,7 @@ public final class Switchboard extends serverSwitch // check if port is already occupied final int port = getConfigInt("port", 8090); try { - if ( 
TimeoutRequest.ping("127.0.0.1", port, 500) ) { + if ( TimeoutRequest.ping(Domains.LOCALHOST, port, 500) ) { throw new RuntimeException( "a server is already running on the YaCy port " + port @@ -392,7 +396,7 @@ public final class Switchboard extends serverSwitch getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list"); final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/" + schemename); if ( !solrWorkProfile.exists() ) { - FileUtils.copy(solrBackupProfile, solrWorkProfile); + Files.copy(solrBackupProfile, solrWorkProfile); } final SolrConfiguration backupScheme = new SolrConfiguration(solrBackupProfile); this.solrScheme = new SolrConfiguration(solrWorkProfile); @@ -404,16 +408,19 @@ public final class Switchboard extends serverSwitch // set up the solr interface final String solrurls = getConfig("federated.service.solr.indexing.url", "http://127.0.0.1:8983/solr"); final boolean usesolr = getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; + int commitWithinMs = getConfigInt("federated.service.solr.indexing.commitWithinMs", 180000); - try { - this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr( - (usesolr) ? 
new SolrShardingConnector( - solrurls, - SolrShardingSelection.Method.MODULO_HOST_MD5, - 10000, true) : null); - } catch ( final IOException e ) { - Log.logException(e); - this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null); + if (usesolr && solrurls != null && solrurls.length() > 0) { + try { + SolrConnector solr = new ShardSolrConnector( + solrurls, + ShardSelection.Method.MODULO_HOST_MD5, + 10000, true); + solr.setCommitWithinMs(commitWithinMs); + this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectRemoteSolr(solr); + } catch ( final IOException e ) { + Log.logException(e); + } } // initialize network database @@ -731,7 +738,7 @@ public final class Switchboard extends serverSwitch getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT), "notifier.gif"); try { - FileUtils.copy(notifierSource, notifierDest); + Files.copy(notifierSource, notifierDest); } catch ( final IOException e ) { } @@ -1800,7 +1807,7 @@ public final class Switchboard extends serverSwitch 0, 0, 0); - response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile, false); + response = new Response(request, null, null, this.crawler.defaultSurrogateProfile, false); final indexingQueueEntry queueEntry = new indexingQueueEntry(Segments.Process.SURROGATES, response, new Document[] { document @@ -2432,8 +2439,18 @@ public final class Switchboard extends serverSwitch public indexingQueueEntry condenseDocument(final indexingQueueEntry in) { in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING); - if ( this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr() != null - && getConfigBool("federated.service.solr.indexing.enabled", false)/*in.queueEntry.profile().pushSolr()*/) { + if ( !in.queueEntry.profile().indexText() && !in.queueEntry.profile().indexMedia() ) { + if ( this.log.isInfo() ) { + this.log.logInfo("Not Condensed Resource '" + + in.queueEntry.url().toNormalform(false, true) + + 
"': indexing not wanted by crawl profile"); + } + return new indexingQueueEntry(in.process, in.queueEntry, in.documents, null); + } + + boolean localSolr = this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getLocalSolr() != null && getConfig("federated.service.yacy.indexing.engine", "classic").equals("solr"); + boolean remoteSolr = this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getRemoteSolr() != null && getConfigBool("federated.service.solr.indexing.enabled", false); + if (localSolr || remoteSolr) { // send the documents to solr for ( final Document doc : in.documents ) { try { @@ -2452,7 +2469,8 @@ public final class Switchboard extends serverSwitch } try { SolrDoc solrDoc = this.solrScheme.yacy2solr(id, in.queueEntry.getResponseHeader(), doc); - this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr().add(solrDoc); + if (localSolr) this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getLocalSolr().add(solrDoc); + if (remoteSolr) this.indexSegments.segment(Segments.Process.LOCALCRAWLING).getRemoteSolr().add(solrDoc); } catch ( final IOException e ) { Log.logWarning( "SOLR", @@ -2469,7 +2487,7 @@ public final class Switchboard extends serverSwitch } // check if we should accept the document for our index - if ( !getConfigBool("federated.service.yacy.indexing.enabled", false) ) { + if (!getConfig("federated.service.yacy.indexing.engine", "classic").equals("classic")) { if ( this.log.isInfo() ) { this.log.logInfo("Not Condensed Resource '" + in.queueEntry.url().toNormalform(false, true) @@ -2477,14 +2495,6 @@ public final class Switchboard extends serverSwitch } return new indexingQueueEntry(in.process, in.queueEntry, in.documents, null); } - if ( !in.queueEntry.profile().indexText() && !in.queueEntry.profile().indexMedia() ) { - if ( this.log.isInfo() ) { - this.log.logInfo("Not Condensed Resource '" - + in.queueEntry.url().toNormalform(false, true) - + "': indexing not wanted by crawl profile"); - } - return new 
indexingQueueEntry(in.process, in.queueEntry, in.documents, null); - } final List doclist = new ArrayList(); // check which files may take part in the indexing process @@ -3347,7 +3357,8 @@ public final class Switchboard extends serverSwitch url = new DigestURI(seedListFileURL); //final long start = System.currentTimeMillis(); client.HEADResponse(url.toString()); - header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + header = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); //final long loadtime = System.currentTimeMillis() - start; /*if (header == null) { if (loadtime > getConfigLong("bootstrapLoadTimeout", 6000)) { diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index a25763844..30af3a83b 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -157,6 +157,7 @@ public class DocumentIndex extends Segment final URIMetadataRow[] rows = new URIMetadataRow[documents.length]; int c = 0; for ( final Document document : documents ) { + if (document == null) continue; final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true); rows[c++] = super.storeDocument( diff --git a/source/net/yacy/search/index/MetadataRepository.java b/source/net/yacy/search/index/MetadataRepository.java index 0ce804a23..6ae2f93eb 100644 --- a/source/net/yacy/search/index/MetadataRepository.java +++ b/source/net/yacy/search/index/MetadataRepository.java @@ -61,6 +61,8 @@ import net.yacy.kelondro.table.SplitTable; import net.yacy.kelondro.util.MemoryControl; import net.yacy.repository.Blacklist; import net.yacy.repository.Blacklist.BlacklistType; +import net.yacy.search.Switchboard; +import net.yacy.search.solr.EmbeddedSolrConnector; import de.anomic.crawler.CrawlStacker; public final class MetadataRepository implements 
/*Metadata,*/ Iterable { @@ -71,7 +73,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable private final File location; private final String tablename; private ArrayList statsDump; - private SolrConnector solr; + private SolrConnector localSolr, remoteSolr; public MetadataRepository( final File path, @@ -85,15 +87,27 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable this.urlIndexFile = backupIndex; //new Cache(backupIndex, 20000000, 20000000); this.exportthread = null; // will have a export thread assigned if exporter is running this.statsDump = null; - this.solr = null; + this.remoteSolr = null; + this.localSolr = null; } - public void connectSolr(final SolrConnector solr) { - this.solr = solr; + public void connectRemoteSolr(final SolrConnector solr) { + this.remoteSolr = solr; } - public SolrConnector getSolr() { - return this.solr; + public void connectLocalSolr() throws IOException { + File solrLocation = this.location; + if (solrLocation.getName().equals("default")) solrLocation = solrLocation.getParentFile(); + solrLocation = new File(solrLocation, "solr"); + this.localSolr = new EmbeddedSolrConnector(solrLocation, new File(new File(Switchboard.getSwitchboard().appPath,"defaults"), "solr")); + } + + public SolrConnector getRemoteSolr() { + return this.remoteSolr; + } + + public SolrConnector getLocalSolr() { + return this.localSolr; } public void clearCache() { @@ -123,7 +137,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable this.urlIndexFile.close(); this.urlIndexFile = null; } - if (this.solr != null) this.solr.close(); + if (this.remoteSolr != null) this.remoteSolr.close(); + if (this.localSolr != null) this.localSolr.close(); } public int writeCacheSize() { @@ -207,7 +222,7 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable public boolean exists(final byte[] urlHash) { if (urlHash == null) return false; try { - if (this.solr != null && 
this.solr.exists(ASCII.String(urlHash))) { + if (this.remoteSolr != null && this.remoteSolr.exists(ASCII.String(urlHash))) { return true; } } catch (final Throwable e) { diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 210a3f280..cbf35264d 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -152,14 +152,23 @@ public class Segment { // create LURL-db this.urlMetadata = new MetadataRepository(segmentPath, "text.urlmd", useTailCache, exceed134217727); + //this.connectLocalSolr(); } - public void connectSolr(final SolrConnector solr) { - this.urlMetadata.connectSolr(solr); + public void connectRemoteSolr(final SolrConnector solr) { + this.urlMetadata.connectRemoteSolr(solr); } - public SolrConnector getSolr() { - return this.urlMetadata.getSolr(); + public void connectLocalSolr() throws IOException { + this.urlMetadata.connectLocalSolr(); + } + + public SolrConnector getRemoteSolr() { + return this.urlMetadata.getRemoteSolr(); + } + + public SolrConnector getLocalSolr() { + return this.urlMetadata.getLocalSolr(); } public MetadataRepository urlMetadata() { diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index f898ed248..d8a0b2adb 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -86,7 +86,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable it.remove(); } } - } + } protected void addSolr(final SolrDoc solrdoc, final SolrField key, final String value) { if (isEmpty() || contains(key.name())) solrdoc.addSolr(key, value); @@ -344,7 +344,15 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable } // flash embedded - addSolr(solrdoc, SolrField.flash_b, html.containsFlash()); + if (isEmpty() || contains(SolrField.flash_b.name())) { + MultiProtocolURI[] flashURLs = 
html.getFlash(); + for (MultiProtocolURI u: flashURLs) { + // remove all flash links from ibound/outbound links + inboundLinks.remove(u); + ouboundLinks.remove(u); + } + addSolr(solrdoc, SolrField.flash_b, flashURLs.length > 0); + } // generic evaluation pattern for (final String model: html.getEvaluationModelNames()) { @@ -446,7 +454,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable addSolr(solrdoc, SolrField.lon_coordinate, yacydoc.lon()); addSolr(solrdoc, SolrField.lat_coordinate, yacydoc.lat()); } - addSolr(solrdoc, SolrField.httpstatus_i, 200); + addSolr(solrdoc, SolrField.httpstatus_i, header.getStatusCode()); return solrdoc; } diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java index b5d8f8611..16fa751da 100644 --- a/source/net/yacy/search/query/SnippetProcess.java +++ b/source/net/yacy/search/query/SnippetProcess.java @@ -448,7 +448,7 @@ public class SnippetProcess { this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime); this.neededResults = neededResults; this.shallrun = true; - this.solr = SnippetProcess.this.rankingProcess.getQuery().getSegment().getSolr(); + this.solr = SnippetProcess.this.rankingProcess.getQuery().getSegment().getRemoteSolr(); } @Override diff --git a/source/net/yacy/search/solr/EmbeddedSolrConnector.java b/source/net/yacy/search/solr/EmbeddedSolrConnector.java new file mode 100644 index 000000000..89d982718 --- /dev/null +++ b/source/net/yacy/search/solr/EmbeddedSolrConnector.java @@ -0,0 +1,117 @@ +/** + * EmbeddedSolrConnector + * Copyright 2012 by Michael Peter Christen + * First released 21.06.2012 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +package net.yacy.search.solr; + +import java.io.File; +import java.io.IOException; + +import javax.xml.parsers.ParserConfigurationException; + +import net.yacy.cora.services.federated.solr.AbstractSolrConnector; +import net.yacy.cora.services.federated.solr.SolrConnector; +import net.yacy.cora.services.federated.solr.SolrDoc; +import net.yacy.kelondro.logging.Log; +import net.yacy.search.index.SolrField; + +import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.core.CoreContainer; +import org.xml.sax.SAXException; + +import com.google.common.io.Files; + +public class EmbeddedSolrConnector extends AbstractSolrConnector implements SolrConnector { + + private final CoreContainer core; + private final static String[] confFiles = {"solrconfig.xml", "schema.xml", "stopwords.txt", "synonyms.txt", "protwords.txt", "currency.xml", "elevate.xml", "lang/"}; + //private final static String[] confFiles = {"solrconfig.xml", "schema.xml", "stopwords.txt", "synonyms.txt", "protwords.txt", "currency.xml", "elevate.xml", "lang/"}; + + public EmbeddedSolrConnector(File storagePath, File solr_config) throws IOException { + super(); + // copy the solrconfig.xml to the storage path + File conf = new File(storagePath, "conf"); + conf.mkdirs(); + File source, target; + for (String cf: confFiles) { + source = new File(solr_config, cf); + if (source.isDirectory()) { + target = new File(conf, cf); + target.mkdirs(); + for (String cfl: source.list()) { + 
Files.copy(new File(source, cfl), new File(target, cfl)); + } + } else { + target = new File(conf, cf); + target.getParentFile().mkdirs(); + Files.copy(source, target); + } + } + try { + this.core = new CoreContainer(storagePath.getAbsolutePath(), new File(solr_config, "solr.xml")); + } catch (ParserConfigurationException e) { + throw new IOException(e.getMessage(), e); + } catch (SAXException e) { + throw new IOException(e.getMessage(), e); + } + super.init(new EmbeddedSolrServer(this.core, "collection1")); + } + + @Override + public void close() { + super.close(); + this.core.shutdown(); + } + + public static void main(String[] args) { + File solr_config = new File("defaults/solr"); + File storage = new File("DATA/INDEX/webportal/SEGMENTS/text/solr/"); + storage.mkdirs(); + try { + EmbeddedSolrConnector solr = new EmbeddedSolrConnector(storage, solr_config); + SolrDoc solrdoc = new SolrDoc(); + solrdoc.addSolr(SolrField.id, "ABCD0000abcd"); + solrdoc.addSolr(SolrField.title, "Lorem ipsum"); + solrdoc.addSolr(SolrField.text_t, "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."); + solr.add(solrdoc); + SolrDocumentList searchresult = solr.get(SolrField.text_t.name() + ":tempor", 0, 10); + for (SolrDocument d: searchresult) { + System.out.println(d.toString()); + } + solr.close(); + /* + JettySolrRunner solrJetty = new JettySolrRunner("/solr", 8091, storage.getAbsolutePath()); + try { + solrJetty.start(); + String url = "http://localhost:" + solrJetty.getLocalPort() + "/solr"; + SolrServer server = new HttpSolrServer(url); + } catch (Exception e) { + e.printStackTrace(); + } + */ + } catch (IOException e) { + Log.logException(e); + } + + } + +} diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java index 2d0f92b9b..ac62f5674 100644 --- a/source/net/yacy/yacy.java +++ b/source/net/yacy/yacy.java @@ -80,6 +80,9 @@ import net.yacy.search.Switchboard; import 
net.yacy.search.SwitchboardConstants; import net.yacy.search.index.MetadataRepository; import net.yacy.search.index.Segment; + +import com.google.common.io.Files; + import de.anomic.data.Translator; import de.anomic.http.server.HTTPDemon; import de.anomic.server.serverCore; @@ -181,7 +184,7 @@ public final class yacy { f = new File(dataHome, "DATA/LOG/yacy.logging"); final File f0 = new File(appHome, "defaults/yacy.logging"); if (!f.exists() || f0.lastModified() > f.lastModified()) try { - FileUtils.copy(f0, f); + Files.copy(f0, f); } catch (final IOException e){ System.out.println("could not copy yacy.logging"); } @@ -268,7 +271,7 @@ public final class yacy { // create default notifier picture //TODO: Use templates instead of copying images ... if (!((new File(htDocsPath, "notifier.gif")).exists())) try { - FileUtils.copy(new File(htRootPath, "env/grafics/empty.gif"), + Files.copy(new File(htRootPath, "env/grafics/empty.gif"), new File(htDocsPath, "notifier.gif")); } catch (final IOException e) {}