diff --git a/.classpath b/.classpath index 45b1ff7f1..e6ba6ace3 100644 --- a/.classpath +++ b/.classpath @@ -12,7 +12,6 @@ - @@ -30,9 +29,6 @@ - - - @@ -45,8 +41,24 @@ - + + + + + + + + + + + + + + + + + @@ -55,5 +67,6 @@ + diff --git a/addon/YaCy.app/Contents/Info.plist b/addon/YaCy.app/Contents/Info.plist index cbf9172d8..b8d5c6ce8 100644 --- a/addon/YaCy.app/Contents/Info.plist +++ b/addon/YaCy.app/Contents/Info.plist @@ -37,8 +37,10 @@ ClassPath $JAVAROOT/htroot + $JAVAROOT/lib/J7Zip-modified.jar $JAVAROOT/lib/activation.jar $JAVAROOT/lib/apache-mime4j-0.6.jar + $JAVAROOT/lib/apache-solr-core-3.6.0.jar $JAVAROOT/lib/apache-solr-solrj-3.6.0.jar $JAVAROOT/lib/arq-2.8.7.jar $JAVAROOT/lib/bcmail-jdk15-145.jar @@ -46,11 +48,14 @@ $JAVAROOT/lib/commons-codec-1.6.jar $JAVAROOT/lib/commons-compress-1.4.1.jar $JAVAROOT/lib/commons-fileupload-1.2.2.jar + $JAVAROOT/lib/commons-httpclient-3.1.jar $JAVAROOT/lib/commons-io-2.1.jar $JAVAROOT/lib/commons-jxpath-1.3.jar + $JAVAROOT/lib/commons-lang-2.6.jar $JAVAROOT/lib/commons-logging-1.1.1.jar - $JAVAROOT/lib/fontbox-1.6.0.jar + $JAVAROOT/lib/fontbox-1.7.0.jar $JAVAROOT/lib/geronimo-stax-api_1.0_spec-1.0.1.jar + $JAVAROOT/lib/guava-r05.jar $JAVAROOT/lib/htmllexer.jar $JAVAROOT/lib/htmlparser.jar $JAVAROOT/lib/httpclient-4.2.jar @@ -58,22 +63,30 @@ $JAVAROOT/lib/httpmime-4.2.jar $JAVAROOT/lib/icu4j-core.jar $JAVAROOT/lib/iri-0.8.jar - $JAVAROOT/lib/J7Zip-modified.jar $JAVAROOT/lib/jakarta-oro-2.0.8.jar $JAVAROOT/lib/jcifs-1.3.15.jar $JAVAROOT/lib/jcl-over-slf4j-1.6.1.jar - $JAVAROOT/lib/jempbox-1.6.0.jar + $JAVAROOT/lib/jempbox-1.7.0.jar $JAVAROOT/lib/jena-2.6.4.jar + $JAVAROOT/lib/jetty-6.1.26-patched-JETTY-1340.jar + $JAVAROOT/lib/jetty-util-6.1.26-patched-JETTY-1340.jar $JAVAROOT/lib/jsch-0.1.42.jar $JAVAROOT/lib/json-simple-1.1.jar $JAVAROOT/lib/log4j-1.2.16.jar + $JAVAROOT/lib/log4j-over-slf4j-1.6.1.jar + $JAVAROOT/lib/lucene-analyzers-3.6.0.jar + $JAVAROOT/lib/lucene-core-3.6.0.jar + 
$JAVAROOT/lib/lucene-highlighter-3.6.0.jar + $JAVAROOT/lib/lucene-phonetic-3.6.0.jar + $JAVAROOT/lib/lucene-spatial-3.6.0.jar + $JAVAROOT/lib/lucene-spellchecker-3.6.0.jar $JAVAROOT/lib/metadata-extractor-2.4.0-beta-1.jar $JAVAROOT/lib/mysql-connector-java-5.1.12-bin.jar - $JAVAROOT/lib/pdfbox-1.6.0.jar + $JAVAROOT/lib/pdfbox-1.7.0.jar $JAVAROOT/lib/poi-3.6-20091214.jar $JAVAROOT/lib/poi-scratchpad-3.6-20091214.jar $JAVAROOT/lib/sax-2.0.1.jar - $JAVAROOT/lib/servlet-api.jar + $JAVAROOT/lib/servlet-api-2.5-20081211.jar $JAVAROOT/lib/slf4j-api-1.6.1.jar $JAVAROOT/lib/slf4j-jdk14-1.6.1.jar $JAVAROOT/lib/webcat-0.1-swf.jar diff --git a/build.xml b/build.xml index b786882fa..12bdba82e 100644 --- a/build.xml +++ b/build.xml @@ -156,8 +156,10 @@ + + @@ -165,11 +167,14 @@ + + - + + @@ -177,22 +182,30 @@ - - + + + + + + + + + + - + - + @@ -200,7 +213,7 @@ - + @@ -247,14 +260,6 @@ - - - - - - - - - - - - - @@ -516,10 +515,9 @@ - - - + + + diff --git a/RDFaParser/RDFaParser.xsl b/defaults/RDFaParser.xsl similarity index 100% rename from RDFaParser/RDFaParser.xsl rename to defaults/RDFaParser.xsl diff --git a/defaults/solr/currency.xml b/defaults/solr/currency.xml new file mode 100644 index 000000000..3a9c58afe --- /dev/null +++ b/defaults/solr/currency.xml @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/defaults/solr/elevate.xml b/defaults/solr/elevate.xml new file mode 100644 index 000000000..25d5cebe4 --- /dev/null +++ b/defaults/solr/elevate.xml @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + diff --git a/defaults/solr/lang/contractions_ca.txt b/defaults/solr/lang/contractions_ca.txt new file mode 100644 index 000000000..307a85f91 --- /dev/null +++ b/defaults/solr/lang/contractions_ca.txt @@ -0,0 +1,8 @@ +# Set of Catalan contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +l +m +n +s +t diff --git 
a/defaults/solr/lang/contractions_fr.txt b/defaults/solr/lang/contractions_fr.txt new file mode 100644 index 000000000..722db5883 --- /dev/null +++ b/defaults/solr/lang/contractions_fr.txt @@ -0,0 +1,9 @@ +# Set of French contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +l +m +t +qu +n +s +j diff --git a/defaults/solr/lang/contractions_ga.txt b/defaults/solr/lang/contractions_ga.txt new file mode 100644 index 000000000..9ebe7fa34 --- /dev/null +++ b/defaults/solr/lang/contractions_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +m +b diff --git a/defaults/solr/lang/contractions_it.txt b/defaults/solr/lang/contractions_it.txt new file mode 100644 index 000000000..cac040953 --- /dev/null +++ b/defaults/solr/lang/contractions_it.txt @@ -0,0 +1,23 @@ +# Set of Italian contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +c +l +all +dall +dell +nell +sull +coll +pell +gl +agl +dagl +degl +negl +sugl +un +m +t +s +v +d diff --git a/defaults/solr/lang/hyphenations_ga.txt b/defaults/solr/lang/hyphenations_ga.txt new file mode 100644 index 000000000..4d2642cc5 --- /dev/null +++ b/defaults/solr/lang/hyphenations_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish hyphenations for StopFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +h +n +t diff --git a/defaults/solr/lang/stemdict_nl.txt b/defaults/solr/lang/stemdict_nl.txt new file mode 100644 index 000000000..441072971 --- /dev/null +++ b/defaults/solr/lang/stemdict_nl.txt @@ -0,0 +1,6 @@ +# Set of overrides for the dutch stemmer +# TODO: load this as a resource from the analyzer and sync it in build.xml +fiets fiets +bromfiets bromfiets +ei eier +kind kinder diff --git a/defaults/solr/lang/stoptags_ja.txt b/defaults/solr/lang/stoptags_ja.txt new file mode 100644 index 000000000..71b750845 --- 
/dev/null +++ b/defaults/solr/lang/stoptags_ja.txt @@ -0,0 +1,420 @@ +# +# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. +# +# Any token with a part-of-speech tag that exactly matches those defined in this +# file are removed from the token stream. +# +# Set your own stoptags by uncommenting the lines below. Note that comments are +# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, +# etc. that can be useful for building you own stoptag set. +# +# The entire possible tagset is provided below for convenience. +# +##### +# noun: unclassified nouns +#名詞 +# +# noun-common: Common nouns or nouns where the sub-classification is undefined +#名詞-一般 +# +# noun-proper: Proper nouns where the sub-classification is undefined +#名詞-固有名詞 +# +# noun-proper-misc: miscellaneous proper nouns +#名詞-固有名詞-一般 +# +# noun-proper-person: Personal names where the sub-classification is undefined +#名詞-固有名詞-人名 +# +# noun-proper-person-misc: names that cannot be divided into surname and +# given name; foreign names; names where the surname or given name is unknown. +# e.g. お市の方 +#名詞-固有名詞-人名-一般 +# +# noun-proper-person-surname: Mainly Japanese surnames. +# e.g. 山田 +#名詞-固有名詞-人名-姓 +# +# noun-proper-person-given_name: Mainly Japanese given names. +# e.g. 太郎 +#名詞-固有名詞-人名-名 +# +# noun-proper-organization: Names representing organizations. +# e.g. 通産省, NHK +#名詞-固有名詞-組織 +# +# noun-proper-place: Place names where the sub-classification is undefined +#名詞-固有名詞-地域 +# +# noun-proper-place-misc: Place names excluding countries. +# e.g. アジア, バルセロナ, 京都 +#名詞-固有名詞-地域-一般 +# +# noun-proper-place-country: Country names. +# e.g. 日本, オーストラリア +#名詞-固有名詞-地域-国 +# +# noun-pronoun: Pronouns where the sub-classification is undefined +#名詞-代名詞 +# +# noun-pronoun-misc: miscellaneous pronouns: +# e.g. 
それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ +#名詞-代名詞-一般 +# +# noun-pronoun-contraction: Spoken language contraction made by combining a +# pronoun and the particle 'wa'. +# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ +#名詞-代名詞-縮約 +# +# noun-adverbial: Temporal nouns such as names of days or months that behave +# like adverbs. Nouns that represent amount or ratios and can be used adverbially, +# e.g. 金曜, 一月, 午後, 少量 +#名詞-副詞可能 +# +# noun-verbal: Nouns that take arguments with case and can appear followed by +# 'suru' and related verbs (する, できる, なさる, くださる) +# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り +#名詞-サ変接続 +# +# noun-adjective-base: The base form of adjectives, words that appear before な ("na") +# e.g. 健康, 安易, 駄目, だめ +#名詞-形容動詞語幹 +# +# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数. +# e.g. 0, 1, 2, 何, 数, 幾 +#名詞-数 +# +# noun-affix: noun affixes where the sub-classification is undefined +#名詞-非自立 +# +# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that +# attach to the base form of inflectional words, words that cannot be classified +# into any of the other categories below. This category includes indefinite nouns. +# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, +# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, +# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳, +# わり, 割り, 割, ん-口語/, もん-口語/ +#名詞-非自立-一般 +# +# noun-affix-adverbial: noun affixes that that can behave as adverbs. +# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, +# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, +# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, +# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, +# 儘, 侭, みぎり, 矢先 +#名詞-非自立-副詞可能 +# +# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars +# with the stem よう(だ) ("you(da)"). +# e.g. 
よう, やう, 様 (よう) +#名詞-非自立-助動詞語幹 +# +# noun-affix-adjective-base: noun affixes that can connect to the indeclinable +# connection form な (aux "da"). +# e.g. みたい, ふう +#名詞-非自立-形容動詞語幹 +# +# noun-special: special nouns where the sub-classification is undefined. +#名詞-特殊 +# +# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is +# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base +# form of inflectional words. +# e.g. そう +#名詞-特殊-助動詞語幹 +# +# noun-suffix: noun suffixes where the sub-classification is undefined. +#名詞-接尾 +# +# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect +# to ガル or タイ and can combine into compound nouns, words that cannot be classified into +# any of the other categories below. In general, this category is more inclusive than +# 接尾語 ("suffix") and is usually the last element in a compound noun. +# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み, +# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用 +#名詞-接尾-一般 +# +# noun-suffix-person: Suffixes that form nouns and attach to person names more often +# than other nouns. +# e.g. 君, 様, 著 +#名詞-接尾-人名 +# +# noun-suffix-place: Suffixes that form nouns and attach to place names more often +# than other nouns. +# e.g. 町, 市, 県 +#名詞-接尾-地域 +# +# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that +# can appear before スル ("suru"). +# e.g. 化, 視, 分け, 入り, 落ち, 買い +#名詞-接尾-サ変接続 +# +# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, +# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the +# conjunctive form of inflectional words. +# e.g. そう +#名詞-接尾-助動詞語幹 +# +# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive +# form of inflectional words and appear before the copula だ ("da"). +# e.g. 的, げ, がち +#名詞-接尾-形容動詞語幹 +# +# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. +# e.g. 
後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ) +#名詞-接尾-副詞可能 +# +# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category +# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach +# to numbers. +# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半 +#名詞-接尾-助数詞 +# +# noun-suffix-special: Special suffixes that mainly attach to inflecting words. +# e.g. (楽し) さ, (考え) 方 +#名詞-接尾-特殊 +# +# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words +# together. +# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦) +#名詞-接続詞的 +# +# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are +# semantically verb-like. +# e.g. ごらん, ご覧, 御覧, 頂戴 +#名詞-動詞非自立的 +# +# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, +# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") +# is いわく ("iwaku"). +#名詞-引用文字列 +# +# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and +# behave like an adjective. +# e.g. 申し訳, 仕方, とんでも, 違い +#名詞-ナイ形容詞語幹 +# +##### +# prefix: unclassified prefixes +#接頭詞 +# +# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) +# excluding numerical expressions. +# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派) +#接頭詞-名詞接続 +# +# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb +# in conjunctive form followed by なる/なさる/くださる. +# e.g. お (読みなさい), お (座り) +#接頭詞-動詞接続 +# +# prefix-adjectival: Prefixes that attach to adjectives. +# e.g. お (寒いですねえ), バカ (でかい) +#接頭詞-形容詞接続 +# +# prefix-numerical: Prefixes that attach to numerical expressions. +# e.g. 
約, およそ, 毎時 +#接頭詞-数接続 +# +##### +# verb: unclassified verbs +#動詞 +# +# verb-main: +#動詞-自立 +# +# verb-auxiliary: +#動詞-非自立 +# +# verb-suffix: +#動詞-接尾 +# +##### +# adjective: unclassified adjectives +#形容詞 +# +# adjective-main: +#形容詞-自立 +# +# adjective-auxiliary: +#形容詞-非自立 +# +# adjective-suffix: +#形容詞-接尾 +# +##### +# adverb: unclassified adverbs +#副詞 +# +# adverb-misc: Words that can be segmented into one unit and where adnominal +# modification is not possible. +# e.g. あいかわらず, 多分 +#副詞-一般 +# +# adverb-particle_conjunction: Adverbs that can be followed by の, は, に, +# な, する, だ, etc. +# e.g. こんなに, そんなに, あんなに, なにか, なんでも +#副詞-助詞類接続 +# +##### +# adnominal: Words that only have noun-modifying forms. +# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, +# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, +# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き +#連体詞 +# +##### +# conjunction: Conjunctions that can occur independently. +# e.g. が, けれども, そして, じゃあ, それどころか +接続詞 +# +##### +# particle: unclassified particles. +助詞 +# +# particle-case: case particles where the subclassification is undefined. +助詞-格助詞 +# +# particle-case-misc: Case particles. +# e.g. から, が, で, と, に, へ, より, を, の, にて +助詞-格助詞-一般 +# +# particle-case-quote: the "to" that appears after nouns, a person’s speech, +# quotation marks, expressions of decisions from a meeting, reasons, judgements, +# conjectures, etc. +# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...) +助詞-格助詞-引用 +# +# particle-case-compound: Compounds of particles and verbs that mainly behave +# like case particles. +# e.g. 
という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って, +# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, +# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, +# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, +# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって, +# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, +# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる, +# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ +助詞-格助詞-連語 +# +# particle-conjunctive: +# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, +# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, +# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/ +助詞-接続助詞 +# +# particle-dependency: +# e.g. こそ, さえ, しか, すら, は, も, ぞ +助詞-係助詞 +# +# particle-adverbial: +# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, +# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/, +# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, +# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/, +# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」) +助詞-副助詞 +# +# particle-interjective: particles with interjective grammatical roles. +# e.g. (松島) や +助詞-間投助詞 +# +# particle-coordinate: +# e.g. と, たり, だの, だり, とか, なり, や, やら +助詞-並立助詞 +# +# particle-final: +# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, +# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/ +助詞-終助詞 +# +# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is +# adverbial, conjunctive, or sentence final. For example: +# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」 +# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」 +# 「(祈りが届いたせい) か (, 試験に合格した.)」 +# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」 +# e.g. か +助詞-副助詞/並立助詞/終助詞 +# +# particle-adnominalizer: The "no" that attaches to nouns and modifies +# non-inflectional words. 
+助詞-連体化 +# +# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs +# that are giongo, giseigo, or gitaigo. +# e.g. に, と +助詞-副詞化 +# +# particle-special: A particle that does not fit into one of the above classifications. +# This includes particles that are used in Tanka, Haiku, and other poetry. +# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家) +助詞-特殊 +# +##### +# auxiliary-verb: +助動詞 +# +##### +# interjection: Greetings and other exclamations. +# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, +# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい +#感動詞 +# +##### +# symbol: unclassified Symbols. +記号 +# +# symbol-misc: A general symbol not in one of the categories below. +# e.g. [○◎@$〒→+] +記号-一般 +# +# symbol-comma: Commas +# e.g. [,、] +記号-読点 +# +# symbol-period: Periods and full stops. +# e.g. [..。] +記号-句点 +# +# symbol-space: Full-width whitespace. +記号-空白 +# +# symbol-open_bracket: +# e.g. [({‘“『【] +記号-括弧開 +# +# symbol-close_bracket: +# e.g. [)}’”』」】] +記号-括弧閉 +# +# symbol-alphabetic: +#記号-アルファベット +# +##### +# other: unclassified other +#その他 +# +# other-interjection: Words that are hard to classify as noun-suffixes or +# sentence-final particles. +# e.g. (だ)ァ +その他-間投 +# +##### +# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. +# e.g. あの, うんと, えと +フィラー +# +##### +# non-verbal: non-verbal sound. +非言語音 +# +##### +# fragment: +#語断片 +# +##### +# unknown: unknown part of speech. +#未知語 +# +##### End of file diff --git a/defaults/solr/lang/stopwords_ar.txt b/defaults/solr/lang/stopwords_ar.txt new file mode 100644 index 000000000..046829db6 --- /dev/null +++ b/defaults/solr/lang/stopwords_ar.txt @@ -0,0 +1,125 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +# Cleaned on October 11, 2009 (not normalized, so use before normalization) +# This means that when modifying this list, you might need to add some +# redundant entries, for example containing forms with both أ and ا +من +ومن +منها +منه +في +وفي +فيها +فيه +و +ف +ثم +او +أو +ب +بها +به +ا +أ +اى +اي +أي +أى +لا +ولا +الا +ألا +إلا +لكن +ما +وما +كما +فما +عن +مع +اذا +إذا +ان +أن +إن +انها +أنها +إنها +انه +أنه +إنه +بان +بأن +فان +فأن +وان +وأن +وإن +التى +التي +الذى +الذي +الذين +الى +الي +إلى +إلي +على +عليها +عليه +اما +أما +إما +ايضا +أيضا +كل +وكل +لم +ولم +لن +ولن +هى +هي +هو +وهى +وهي +وهو +فهى +فهي +فهو +انت +أنت +لك +لها +له +هذه +هذا +تلك +ذلك +هناك +كانت +كان +يكون +تكون +وكانت +وكان +غير +بعض +قد +نحو +بين +بينما +منذ +ضمن +حيث +الان +الآن +خلال +بعد +قبل +حتى +عند +عندما +لدى +جميع diff --git a/defaults/solr/lang/stopwords_bg.txt b/defaults/solr/lang/stopwords_bg.txt new file mode 100644 index 000000000..1ae4ba2ae --- /dev/null +++ b/defaults/solr/lang/stopwords_bg.txt @@ -0,0 +1,193 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +а +аз +ако +ала +бе +без +беше +би +бил +била +били +било +близо +бъдат +бъде +бяха +в +вас +ваш +ваша +вероятно +вече +взема +ви +вие +винаги +все +всеки +всички +всичко +всяка +във +въпреки +върху +г +ги +главно +го +д +да +дали +до +докато +докога +дори +досега +доста +е +едва +един +ето +за +зад +заедно +заради +засега +затова +защо +защото +и +из +или +им +има +имат +иска +й +каза +как +каква +какво +както +какъв +като +кога +когато +което +които +кой +който +колко +която +къде +където +към +ли +м +ме +между +мен +ми +мнозина +мога +могат +може +моля +момента +му +н +на +над +назад +най +направи +напред +например +нас +не +него +нея +ни +ние +никой +нито +но +някои +някой +няма +обаче +около +освен +особено +от +отгоре +отново +още +пак +по +повече +повечето +под +поне +поради +после +почти +прави +пред +преди +през +при +пък +първо +с +са +само +се +сега +си +скоро +след +сме +според +сред +срещу +сте +съм +със +също +т +тази +така +такива +такъв +там +твой +те +тези +ти +тн +то +това +тогава +този +той +толкова +точно +трябва +тук +тъй +тя +тях +у +харесва +ч +че +често +чрез +ще +щом +я diff --git a/defaults/solr/lang/stopwords_ca.txt b/defaults/solr/lang/stopwords_ca.txt new file mode 100644 index 000000000..3da65deaf --- /dev/null +++ b/defaults/solr/lang/stopwords_ca.txt @@ -0,0 +1,220 @@ +# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) +a +abans +ací +ah +així +això +al +als +aleshores +algun +alguna +algunes +alguns +alhora +allà +allí +allò +altra +altre +altres +amb +ambdós +ambdues +apa +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquí +baix +cada +cadascú +cadascuna +cadascunes +cadascuns +com +contra +d'un +d'una +d'unes +d'uns +dalt +de +del +dels +des +després +dins +dintre +donat +doncs +durant +e +eh +el +els +em +en +encara +ens +entre +érem +eren +éreu +es +és +esta +està +estàvem +estaven +estàveu +esteu +et +etc +ets 
+fins +fora +gairebé +ha +han +has +havia +he +hem +heu +hi +ho +i +igual +iguals +ja +l'hi +la +les +li +li'n +llavors +m'he +ma +mal +malgrat +mateix +mateixa +mateixes +mateixos +me +mentre +més +meu +meus +meva +meves +molt +molta +moltes +molts +mon +mons +n'he +n'hi +ne +ni +no +nogensmenys +només +nosaltres +nostra +nostre +nostres +o +oh +oi +on +pas +pel +pels +per +però +perquè +poc +poca +pocs +poques +potser +propi +qual +quals +quan +quant +que +què +quelcom +qui +quin +quina +quines +quins +s'ha +s'han +sa +semblant +semblants +ses +seu +seus +seva +seva +seves +si +sobre +sobretot +sóc +solament +sols +son +són +sons +sota +sou +t'ha +t'han +t'he +ta +tal +també +tampoc +tan +tant +tanta +tantes +teu +teus +teva +teves +ton +tons +tot +tota +totes +tots +un +una +unes +uns +us +va +vaig +vam +van +vas +veu +vosaltres +vostra +vostre +vostres diff --git a/defaults/solr/lang/stopwords_cz.txt b/defaults/solr/lang/stopwords_cz.txt new file mode 100644 index 000000000..53c6097da --- /dev/null +++ b/defaults/solr/lang/stopwords_cz.txt @@ -0,0 +1,172 @@ +a +s +k +o +i +u +v +z +dnes +cz +tímto +budeš +budem +byli +jseš +můj +svým +ta +tomto +tohle +tuto +tyto +jej +zda +proč +máte +tato +kam +tohoto +kdo +kteří +mi +nám +tom +tomuto +mít +nic +proto +kterou +byla +toho +protože +asi +ho +naši +napište +re +což +tím +takže +svých +její +svými +jste +aj +tu +tedy +teto +bylo +kde +ke +pravé +ji +nad +nejsou +či +pod +téma +mezi +přes +ty +pak +vám +ani +když +však +neg +jsem +tento +článku +články +aby +jsme +před +pta +jejich +byl +ještě +až +bez +také +pouze +první +vaše +která +nás +nový +tipy +pokud +může +strana +jeho +své +jiné +zprávy +nové +není +vás +jen +podle +zde +už +být +více +bude +již +než +který +by +které +co +nebo +ten +tak +má +při +od +po +jsou +jak +další +ale +si +se +ve +to +jako +za +zpět +ze +do +pro +je +na +atd +atp +jakmile +přičemž +já +on +ona +ono +oni +ony +my +vy +jí +ji +mě +mne +jemu +tomu +těm +těmu +němu +němuž +jehož 
+jíž +jelikož +jež +jakož +načež diff --git a/defaults/solr/lang/stopwords_da.txt b/defaults/solr/lang/stopwords_da.txt new file mode 100644 index 000000000..a3ff5fe12 --- /dev/null +++ b/defaults/solr/lang/stopwords_da.txt @@ -0,0 +1,108 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A Danish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + +og | and +i | in +jeg | I +det | that (dem. pronoun)/it (pers. pronoun) +at | that (in front of a sentence)/to (with infinitive) +en | a/an +den | it (pers. pronoun)/that (dem. pronoun) +til | to/at/for/until/against/by/of/into, more +er | present tense of "to be" +som | who, as +på | on/upon/in/on/at/to/after/of/with/for, on +de | they +med | with/by/in, along +han | he +af | of/by/from/off/for/in/with/on, off +for | at/for/to/from/by/of/ago, in front/before, because +ikke | not +der | who/which, there/those +var | past tense of "to be" +mig | me/myself +sig | oneself/himself/herself/itself/themselves +men | but +et | a/an/one, one (number), someone/somebody/one +har | present tense of "to have" +om | round/about/for/in/a, about/around/down, if +vi | we +min | my +havde | past tense of "to have" +ham | him +hun | she +nu | now +over | over/above/across/by/beyond/past/on/about, over/past +da | then, when/as/since +fra | from/off/since, off, since +du | you +ud | out +sin | his/her/its/one's +dem | them +os | us/ourselves +op | up +man | you/one +hans | his +hvor | where +eller | or +hvad | what +skal | must/shall etc. 
+selv | myself/youself/herself/ourselves etc., even +her | here +alle | all/everyone/everybody etc. +vil | will (verb) +blev | past tense of "to stay/to remain/to get/to become" +kunne | could +ind | in +når | when +være | present tense of "to be" +dog | however/yet/after all +noget | something +ville | would +jo | you know/you see (adv), yes +deres | their/theirs +efter | after/behind/according to/for/by/from, later/afterwards +ned | down +skulle | should +denne | this +end | than +dette | this +mit | my/mine +også | also +under | under/beneath/below/during, below/underneath +have | have +dig | you +anden | other +hende | her +mine | my +alt | everything +meget | much/very, plenty of +sit | his, her, its, one's +sine | his, her, its, one's +vor | our +mod | against +disse | these +hvis | if +din | your/yours +nogle | some +hos | by/at +blive | be/become +mange | many +ad | by/through +bliver | present tense of "to be/to become" +hendes | her/hers +været | be +thi | for (conj) +jer | you +sådan | such, like this/like that diff --git a/defaults/solr/lang/stopwords_de.txt b/defaults/solr/lang/stopwords_de.txt new file mode 100644 index 000000000..f77038418 --- /dev/null +++ b/defaults/solr/lang/stopwords_de.txt @@ -0,0 +1,292 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. 
+ + +aber | but + +alle | all +allem +allen +aller +alles + +als | than, as +also | so +am | an + dem +an | at + +ander | other +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders + +auch | also +auf | on +aus | out of +bei | by +bin | am +bis | until +bist | art +da | there +damit | with it +dann | then + +der | the +den +des +dem +die +das + +daß | that + +derselbe | the same +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe + +dazu | to that + +dein | thy +deine +deinem +deinen +deiner +deines + +denn | because + +derer | of those +dessen | of him + +dich | thee +dir | to thee +du | thou + +dies | this +diese +diesem +diesen +dieser +dieses + + +doch | (several meanings) +dort | (over) there + + +durch | through + +ein | a +eine +einem +einen +einer +eines + +einig | some +einige +einigem +einigen +einiger +einiges + +einmal | once + +er | he +ihn | him +ihm | to him + +es | it +etwas | something + +euer | your +eure +eurem +euren +eurer +eures + +für | for +gegen | towards +gewesen | p.p. 
of sein +hab | have +habe | have +haben | have +hat | has +hatte | had +hatten | had +hier | here +hin | there +hinter | behind + +ich | I +mich | me +mir | to me + + +ihr | you, to her +ihre +ihrem +ihren +ihrer +ihres +euch | to you + +im | in + dem +in | in +indem | while +ins | in + das +ist | is + +jede | each, every +jedem +jeden +jeder +jedes + +jene | that +jenem +jenen +jener +jenes + +jetzt | now +kann | can + +kein | no +keine +keinem +keinen +keiner +keines + +können | can +könnte | could +machen | do +man | one + +manche | some, many a +manchem +manchen +mancher +manches + +mein | my +meine +meinem +meinen +meiner +meines + +mit | with +muss | must +musste | had to +nach | to(wards) +nicht | not +nichts | nothing +noch | still, yet +nun | now +nur | only +ob | whether +oder | or +ohne | without +sehr | very + +sein | his +seine +seinem +seinen +seiner +seines + +selbst | self +sich | herself + +sie | they, she +ihnen | to them + +sind | are +so | so + +solche | such +solchem +solchen +solcher +solches + +soll | shall +sollte | should +sondern | but +sonst | else +über | over +um | about, around +und | and + +uns | us +unse +unsem +unsen +unser +unses + +unter | under +viel | much +vom | von + dem +von | from +vor | before +während | while +war | was +waren | were +warst | wast +was | what +weg | away, off +weil | because +weiter | further + +welche | which +welchem +welchen +welcher +welches + +wenn | when +werde | will +werden | will +wie | how +wieder | again +will | want +wir | we +wird | will +wirst | willst +wo | where +wollen | want +wollte | wanted +würde | would +würden | would +zu | to +zum | zu + dem +zur | zu + der +zwar | indeed +zwischen | between + diff --git a/defaults/solr/lang/stopwords_el.txt b/defaults/solr/lang/stopwords_el.txt new file mode 100644 index 000000000..232681f5b --- /dev/null +++ b/defaults/solr/lang/stopwords_el.txt @@ -0,0 +1,78 @@ +# Lucene Greek Stopwords list +# Note: by default this file is used after 
GreekLowerCaseFilter, +# so when modifying this file use 'σ' instead of 'ς' +ο +η +το +οι +τα +του +τησ +των +τον +την +και +κι +κ +ειμαι +εισαι +ειναι +ειμαστε +ειστε +στο +στον +στη +στην +μα +αλλα +απο +για +προσ +με +σε +ωσ +παρα +αντι +κατα +μετα +θα +να +δε +δεν +μη +μην +επι +ενω +εαν +αν +τοτε +που +πωσ +ποιοσ +ποια +ποιο +ποιοι +ποιεσ +ποιων +ποιουσ +αυτοσ +αυτη +αυτο +αυτοι +αυτων +αυτουσ +αυτεσ +αυτα +εκεινοσ +εκεινη +εκεινο +εκεινοι +εκεινεσ +εκεινα +εκεινων +εκεινουσ +οπωσ +ομωσ +ισωσ +οσο +οτι diff --git a/defaults/solr/lang/stopwords_en.txt b/defaults/solr/lang/stopwords_en.txt new file mode 100644 index 000000000..2c164c0b2 --- /dev/null +++ b/defaults/solr/lang/stopwords_en.txt @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with diff --git a/defaults/solr/lang/stopwords_es.txt b/defaults/solr/lang/stopwords_es.txt new file mode 100644 index 000000000..2db147600 --- /dev/null +++ b/defaults/solr/lang/stopwords_es.txt @@ -0,0 +1,354 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A Spanish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | from, of +la | the, her +que | who, that +el | the +en | in +y | and +a | to +los | the, them +del | de + el +se | himself, from him etc +las | the, them +por | for, by, etc +un | a +para | for +con | with +no | no +una | a +su | his, her +al | a + el + | es from SER +lo | him +como | how +más | more +pero | pero +sus | su plural +le | to him, her +ya | already +o | or + | fue from SER +este | this + | ha from HABER +sí | himself etc +porque | because +esta | this + | son from SER +entre | between + | está from ESTAR +cuando | when +muy | very +sin | without +sobre | on + | ser from SER + | tiene from TENER +también | also +me | me +hasta | until +hay | there is/are +donde | where + | han from HABER +quien | whom, that + | están from ESTAR + | estado from ESTAR +desde | from +todo | all +nos | us +durante | during + | estados from ESTAR +todos | all +uno | a +les | to them +ni | nor +contra | against +otros | other + | fueron from SER +ese | that +eso | that + | había from HABER +ante | before +ellos | they +e | and (variant of y) +esto | this +mí | me +antes | before +algunos | some +qué | what? 
+unos | a +yo | I +otro | other +otras | other +otra | other +él | he +tanto | so much, many +esa | that +estos | these +mucho | much, many +quienes | who +nada | nothing +muchos | many +cual | who + | sea from SER +poco | few +ella | she +estar | to be + | haber from HABER +estas | these + | estaba from ESTAR + | estamos from ESTAR +algunas | some +algo | something +nosotros | we + + | other forms + +mi | me +mis | mi plural +tú | thou +te | thee +ti | thee +tu | thy +tus | tu plural +ellas | they +nosotras | we +vosotros | you +vosotras | you +os | you +mío | mine +mía | +míos | +mías | +tuyo | thine +tuya | +tuyos | +tuyas | +suyo | his, hers, theirs +suya | +suyos | +suyas | +nuestro | ours +nuestra | +nuestros | +nuestras | +vuestro | yours +vuestra | +vuestros | +vuestras | +esos | those +esas | those + + | forms of estar, to be (not including the infinitive): +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad + + | forms of haber, to have (not including the infinitive): +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas + + | forms of ser, to be (not including the infinitive): +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería 
+serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +siendo +sido + | sed also means 'thirst' + + | forms of tener, to have (not including the infinitive): +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened + diff --git a/defaults/solr/lang/stopwords_eu.txt b/defaults/solr/lang/stopwords_eu.txt new file mode 100644 index 000000000..25f1db934 --- /dev/null +++ b/defaults/solr/lang/stopwords_eu.txt @@ -0,0 +1,99 @@ +# example set of basque stopwords +al +anitz +arabera +asko +baina +bat +batean +batek +bati +batzuei +batzuek +batzuetan +batzuk +bera +beraiek +berau +berauek +bere +berori +beroriek +beste +bezala +da +dago +dira +ditu +du +dute +edo +egin +ere +eta +eurak +ez +gainera +gu +gutxi +guzti +haiei +haiek +haietan +hainbeste +hala +han +handik +hango +hara +hari +hark +hartan +hau +hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honek +honela +honetan +honi +hor +hori +horiei +horiek +horietan +horko +horra +horrek +horrela +horretan +horri +hortik +hura +izan +ni +noiz +nola +non +nondik +nongo +nor +nora +ze +zein +zen +zenbait +zenbat +zer +zergatik +ziren +zituen +zu +zuek +zuen +zuten diff --git a/defaults/solr/lang/stopwords_fa.txt b/defaults/solr/lang/stopwords_fa.txt new file mode 100644 index 000000000..723641c6d --- /dev/null +++ b/defaults/solr/lang/stopwords_fa.txt @@ -0,0 +1,313 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. 
+# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +# Note: by default this file is used after normalization, so when adding entries +# to this file, use the arabic 'ي' instead of 'ی' +انان +نداشته +سراسر +خياه +ايشان +وي +تاكنون +بيشتري +دوم +پس +ناشي +وگو +يا +داشتند +سپس +هنگام +هرگز +پنج +نشان +امسال +ديگر +گروهي +شدند +چطور +ده +و +دو +نخستين +ولي +چرا +چه +وسط +ه +كدام +قابل +يك +رفت +هفت +همچنين +در +هزار +بله +بلي +شايد +اما +شناسي +گرفته +دهد +داشته +دانست +داشتن +خواهيم +ميليارد +وقتيكه +امد +خواهد +جز +اورده +شده +بلكه +خدمات +شدن +برخي +نبود +بسياري +جلوگيري +حق +كردند +نوعي +بعري +نكرده +نظير +نبايد +بوده +بودن +داد +اورد +هست +جايي +شود +دنبال +داده +بايد +سابق +هيچ +همان +انجا +كمتر +كجاست +گردد +كسي +تر +مردم +تان +دادن +بودند +سري +جدا +ندارند +مگر +يكديگر +دارد +دهند +بنابراين +هنگامي +سمت +جا +انچه +خود +دادند +زياد +دارند +اثر +بدون +بهترين +بيشتر +البته +به +براساس +بيرون +كرد +بعضي +گرفت +توي +اي +ميليون +او +جريان +تول +بر +مانند +برابر +باشيم +مدتي +گويند +اكنون +تا +تنها +جديد +چند +بي +نشده +كردن +كردم +گويد +كرده +كنيم +نمي +نزد +روي +قصد +فقط +بالاي +ديگران +اين +ديروز +توسط +سوم +ايم +دانند +سوي +استفاده +شما +كنار +داريم +ساخته +طور +امده +رفته +نخست +بيست +نزديك +طي +كنيد +از +انها +تمامي +داشت +يكي +طريق +اش +چيست +روب +نمايد +گفت +چندين +چيزي +تواند +ام +ايا +با +ان +ايد +ترين +اينكه +ديگري +راه +هايي +بروز +همچنان +پاعين +كس +حدود +مختلف +مقابل +چيز +گيرد +ندارد +ضد +همچون +سازي +شان +مورد +باره +مرسي +خويش +برخوردار +چون +خارج +شش +هنوز +تحت +ضمن +هستيم +گفته +فكر +بسيار +پيش +براي +روزهاي +انكه +نخواهد +بالا +كل +وقتي +كي +چنين +كه +گيري +نيست +است +كجا +كند +نيز +يابد +بندي +حتي +توانند +عقب +خواست +كنند +بين +تمام +همه +ما +باشند +مثل +شد +اري +باشد +اره +طبق +بعد +اگر +صورت +غير +جاي +بيش +ريزي +اند +زيرا +چگونه +بار +لطفا +مي +درباره +من +ديده +همين +گذاري +برداري +علت +گذاشته +هم +فوق +نه +ها +شوند +اباد +همواره +هر +اول +خواهند +چهار +نام +امروز +مان 
+هاي +قبل +كنم +سعي +تازه +را +هستند +زير +جلوي +عنوان +بود diff --git a/defaults/solr/lang/stopwords_fi.txt b/defaults/solr/lang/stopwords_fi.txt new file mode 100644 index 000000000..addad798c --- /dev/null +++ b/defaults/solr/lang/stopwords_fi.txt @@ -0,0 +1,95 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + +| forms of BE + +olla +olen +olet +on +olemme +olette +ovat +ole | negative form + +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet + +en | negation +et +ei +emme +ette +eivät + +|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans +minä minun minut minua minussa minusta minuun minulla minulta minulle | I +sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you +hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she +me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we +te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you +he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they + +tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this +tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that +se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it +nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these +nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those +ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they + +kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who +ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) +mikä minkä minkä mitä missä 
mistä mihin millä miltä mille minä miksi | which what +mitkä | (pl) + +joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which +jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) + +| conjunctions + +että | that +ja | and +jos | if +koska | because +kuin | than +mutta | but +niin | so +sekä | and +sillä | for +tai | or +vaan | but +vai | or +vaikka | although + + +| prepositions + +kanssa | with +mukaan | according to +noin | about +poikki | across +yli | over, across + +| other + +kun | when +niin | so +nyt | now +itse | self + diff --git a/defaults/solr/lang/stopwords_fr.txt b/defaults/solr/lang/stopwords_fr.txt new file mode 100644 index 000000000..c00837ea9 --- /dev/null +++ b/defaults/solr/lang/stopwords_fr.txt @@ -0,0 +1,183 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A French stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. 
+ +au | a + le +aux | a + les +avec | with +ce | this +ces | these +dans | with +de | of +des | de + les +du | de + le +elle | she +en | `of them' etc +et | and +eux | them +il | he +je | I +la | the +le | the +leur | their +lui | him +ma | my (fem) +mais | but +me | me +même | same; as in moi-même (myself) etc +mes | me (pl) +moi | me +mon | my (masc) +ne | not +nos | our (pl) +notre | our +nous | we +on | one +ou | where +par | by +pas | not +pour | for +qu | que before vowel +que | that +qui | who +sa | his, her (fem) +se | oneself +ses | his (pl) +son | his, her (masc) +sur | on +ta | thy (fem) +te | thee +tes | thy (pl) +toi | thee +ton | thy (masc) +tu | thou +un | a +une | a +vos | your (pl) +votre | your +vous | you + + | single letter forms + +c | c' +d | d' +j | j' +l | l' +à | to, at +m | m' +n | n' +s | s' +t | t' +y | there + + | forms of être (not including the infinitive): +été +étée +étées +étés +étant +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent + + | forms of avoir (not including the infinitive): +ayant +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent + + | Later additions (from Jean-Christophe Deschamps) +ceci | this +celà  | that +cet | this +cette | this +ici | here +ils | they +les | the (pl) +leurs | their (pl) +quel | which +quels | which +quelle | which +quelles | which +sans | without +soi | oneself + diff --git a/defaults/solr/lang/stopwords_ga.txt b/defaults/solr/lang/stopwords_ga.txt new file mode 100644 index 000000000..9ff88d747 --- /dev/null +++ 
b/defaults/solr/lang/stopwords_ga.txt @@ -0,0 +1,110 @@ + +a +ach +ag +agus +an +aon +ar +arna +as +b' +ba +beirt +bhúr +caoga +ceathair +ceathrar +chomh +chtó +chuig +chun +cois +céad +cúig +cúigear +d' +daichead +dar +de +deich +deichniúr +den +dhá +do +don +dtí +dá +dár +dó +faoi +faoin +faoina +faoinár +fara +fiche +gach +gan +go +gur +haon +hocht +i +iad +idir +in +ina +ins +inár +is +le +leis +lena +lenár +m' +mar +mo +mé +na +nach +naoi +naonúr +ná +ní +níor +nó +nócha +ocht +ochtar +os +roimh +sa +seacht +seachtar +seachtó +seasca +seisear +siad +sibh +sinn +sna +sé +sí +tar +thar +thú +triúr +trí +trína +trínár +tríocha +tú +um +ár +é +éis +í +ó +ón +óna +ónár diff --git a/defaults/solr/lang/stopwords_gl.txt b/defaults/solr/lang/stopwords_gl.txt new file mode 100644 index 000000000..d8760b12c --- /dev/null +++ b/defaults/solr/lang/stopwords_gl.txt @@ -0,0 +1,161 @@ +# galican stopwords +a +aínda +alí +aquel +aquela +aquelas +aqueles +aquilo +aquí +ao +aos +as +así +á +ben +cando +che +co +coa +comigo +con +connosco +contigo +convosco +coas +cos +cun +cuns +cunha +cunhas +da +dalgunha +dalgunhas +dalgún +dalgúns +das +de +del +dela +delas +deles +desde +deste +do +dos +dun +duns +dunha +dunhas +e +el +ela +elas +eles +en +era +eran +esa +esas +ese +eses +esta +estar +estaba +está +están +este +estes +estiven +estou +eu +é +facer +foi +foron +fun +había +hai +iso +isto +la +las +lle +lles +lo +los +mais +me +meu +meus +min +miña +miñas +moi +na +nas +neste +nin +no +non +nos +nosa +nosas +noso +nosos +nós +nun +nunha +nuns +nunhas +o +os +ou +ó +ós +para +pero +pode +pois +pola +polas +polo +polos +por +que +se +senón +ser +seu +seus +sexa +sido +sobre +súa +súas +tamén +tan +te +ten +teñen +teño +ter +teu +teus +ti +tido +tiña +tiven +túa +túas +un +unha +unhas +uns +vos +vosa +vosas +voso +vosos +vós diff --git a/defaults/solr/lang/stopwords_hi.txt b/defaults/solr/lang/stopwords_hi.txt new file mode 100644 index 000000000..86286bb08 --- /dev/null +++ 
b/defaults/solr/lang/stopwords_hi.txt @@ -0,0 +1,235 @@ +# Also see http://www.opensource.org/licenses/bsd-license.html +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# This file was created by Jacques Savoy and is distributed under the BSD license. +# Note: by default this file also contains forms normalized by HindiNormalizer +# for spelling variation (see section below), such that it can be used whether or +# not you enable that feature. When adding additional entries to this list, +# please add the normalized form as well. +अंदर +अत +अपना +अपनी +अपने +अभी +आदि +आप +इत्यादि +इन +इनका +इन्हीं +इन्हें +इन्हों +इस +इसका +इसकी +इसके +इसमें +इसी +इसे +उन +उनका +उनकी +उनके +उनको +उन्हीं +उन्हें +उन्हों +उस +उसके +उसी +उसे +एक +एवं +एस +ऐसे +और +कई +कर +करता +करते +करना +करने +करें +कहते +कहा +का +काफ़ी +कि +कितना +किन्हें +किन्हों +किया +किर +किस +किसी +किसे +की +कुछ +कुल +के +को +कोई +कौन +कौनसा +गया +घर +जब +जहाँ +जा +जितना +जिन +जिन्हें +जिन्हों +जिस +जिसे +जीधर +जैसा +जैसे +जो +तक +तब +तरह +तिन +तिन्हें +तिन्हों +तिस +तिसे +तो +था +थी +थे +दबारा +दिया +दुसरा +दूसरे +दो +द्वारा +न +नहीं +ना +निहायत +नीचे +ने +पर +पर +पहले +पूरा +पे +फिर +बनी +बही +बहुत +बाद +बाला +बिलकुल +भी +भीतर +मगर +मानो +मे +में +यदि +यह +यहाँ +यही +या +यिह +ये +रखें +रहा +रहे +ऱ्वासा +लिए +लिये +लेकिन +व +वर्ग +वह +वह +वहाँ +वहीं +वाले +वुह +वे +वग़ैरह +संग +सकता +सकते +सबसे +सभी +साथ +साबुत +साभ +सारा +से +सो +ही +हुआ +हुई +हुए +है +हैं +हो +होता +होती +होते +होना +होने +# additional normalized forms of the above +अपनि +जेसे +होति +सभि +तिंहों +इंहों +दवारा +इसि +किंहें +थि +उंहों +ओर +जिंहें +वहिं +अभि +बनि +हि +उंहिं +उंहें +हें +वगेरह +एसे +रवासा +कोन +निचे +काफि +उसि +पुरा +भितर +हे +बहि +वहां +कोइ +यहां +जिंहों +तिंहें +किसि +कइ +यहि +इंहिं +जिधर +इंहें +अदि +इतयादि +हुइ +कोनसा +इसकि +दुसरे +जहां +अप +किंहों +उनकि +भि +वरग +हुअ +जेसा +नहिं diff --git a/defaults/solr/lang/stopwords_hu.txt b/defaults/solr/lang/stopwords_hu.txt new file mode 100644 index 
000000000..1a96f1db6 --- /dev/null +++ b/defaults/solr/lang/stopwords_hu.txt @@ -0,0 +1,209 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + +| Hungarian stop word list +| prepared by Anna Tordai + +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. 
+ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +ő +ők +őket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/defaults/solr/lang/stopwords_hy.txt b/defaults/solr/lang/stopwords_hy.txt new file mode 100644 index 000000000..60c1c50fb --- /dev/null +++ b/defaults/solr/lang/stopwords_hy.txt @@ -0,0 +1,46 @@ +# example set of Armenian stopwords. 
+այդ +այլ +այն +այս +դու +դուք +եմ +են +ենք +ես +եք +է +էի +էին +էինք +էիր +էիք +էր +ըստ +թ +ի +ին +իսկ +իր +կամ +համար +հետ +հետո +մենք +մեջ +մի +ն +նա +նաև +նրա +նրանք +որ +որը +որոնք +որպես +ու +ում +պիտի +վրա +և diff --git a/defaults/solr/lang/stopwords_id.txt b/defaults/solr/lang/stopwords_id.txt new file mode 100644 index 000000000..4617f83a5 --- /dev/null +++ b/defaults/solr/lang/stopwords_id.txt @@ -0,0 +1,359 @@ +# from appendix D of: A Study of Stemming Effects on Information +# Retrieval in Bahasa Indonesia +ada +adanya +adalah +adapun +agak +agaknya +agar +akan +akankah +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +diantaranya +antara +antaranya +diantara +apa +apaan +mengapa +apabila +apakah +apalagi +apatah +atau +ataukah +ataupun +bagai +bagaikan +sebagai +sebagainya +bagaimana +bagaimanapun +sebagaimana +bagaimanakah +bagi +bahkan +bahwa +bahwasanya +sebaliknya +banyak +sebanyak +beberapa +seberapa +begini +beginian +beginikah +beginilah +sebegini +begitu +begitukah +begitulah +begitupun +sebegitu +belum +belumlah +sebelum +sebelumnya +sebenarnya +berapa +berapakah +berapalah +berapapun +betulkah +sebetulnya +biasa +biasanya +bila +bilakah +bisa +bisakah +sebisanya +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +cuma +percuma +dahulu +dalam +dan +dapat +dari +daripada +dekat +demi +demikian +demikianlah +sedemikian +dengan +depan +di +dia +dialah +dini +diri +dirinya +terdiri +dong +dulu +enggak +enggaknya +entah +entahlah +terhadap +terhadapnya +hal +hampir +hanya +hanyalah +harus +haruslah +harusnya +seharusnya +hendak +hendaklah +hendaknya +hingga +sehingga +ia +ialah +ibarat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jangan +jangankan +janganlah +jika +jikalau +juga +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +dikarenakan +karena +karenanya +ke +kecil +kemudian +kenapa +kepada +kepadanya +ketika +seketika +khususnya 
+kini +kinilah +kiranya +sekiranya +kita +kitalah +kok +lagi +lagian +selagi +lah +lain +lainnya +melainkan +selaku +lalu +melalui +terlalu +lama +lamanya +selama +selama +selamanya +lebih +terlebih +bermacam +macam +semacam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masih +masihkah +semasih +masing +mau +maupun +semaunya +memang +mereka +merekalah +meski +meskipun +semula +mungkin +mungkinkah +nah +namun +nanti +nantinya +nyaris +oleh +olehnya +seorang +seseorang +pada +padanya +padahal +paling +sepanjang +pantas +sepantasnya +sepantasnyalah +para +pasti +pastilah +per +pernah +pula +pun +merupakan +rupanya +serupa +saat +saatnya +sesaat +saja +sajalah +saling +bersama +sama +sesama +sambil +sampai +sana +sangat +sangatlah +saya +sayalah +se +sebab +sebabnya +sebuah +tersebut +tersebutlah +sedang +sedangkan +sedikit +sedikitnya +segala +segalanya +segera +sesegera +sejak +sejenak +sekali +sekalian +sekalipun +sesekali +sekaligus +sekarang +sekarang +sekitar +sekitarnya +sela +selain +selalu +seluruh +seluruhnya +semakin +sementara +sempat +semua +semuanya +sendiri +sendirinya +seolah +seperti +sepertinya +sering +seringnya +serta +siapa +siapakah +siapapun +disini +disinilah +sini +sinilah +sesuatu +sesuatunya +suatu +sesudah +sesudahnya +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tak +tanpa +setelah +telah +tentang +tentu +tentulah +tentunya +tertentu +seterusnya +tapi +tetapi +setiap +tiap +setidaknya +tidak +tidakkah +tidaklah +toh +waduh +wah +wahai +sewaktu +walau +walaupun +wong +yaitu +yakni +yang diff --git a/defaults/solr/lang/stopwords_it.txt b/defaults/solr/lang/stopwords_it.txt new file mode 100644 index 000000000..4cb5b0891 --- /dev/null +++ b/defaults/solr/lang/stopwords_it.txt @@ -0,0 +1,301 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | An Italian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +ad | a (to) before vowel +al | a + il +allo | a + lo +ai | a + i +agli | a + gli +all | a + l' +agl | a + gl' +alla | a + la +alle | a + le +con | with +col | con + il +coi | con + i (forms collo, cogli etc are now very rare) +da | from +dal | da + il +dallo | da + lo +dai | da + i +dagli | da + gli +dall | da + l' +dagl | da + gll' +dalla | da + la +dalle | da + le +di | of +del | di + il +dello | di + lo +dei | di + i +degli | di + gli +dell | di + l' +degl | di + gl' +della | di + la +delle | di + le +in | in +nel | in + el +nello | in + lo +nei | in + i +negli | in + gli +nell | in + l' +negl | in + gl' +nella | in + la +nelle | in + le +su | on +sul | su + il +sullo | su + lo +sui | su + i +sugli | su + gli +sull | su + l' +sugl | su + gl' +sulla | su + la +sulle | su + le +per | through, by +tra | among +contro | against +io | I +tu | thou +lui | he +lei | she +noi | we +voi | you +loro | they +mio | my +mia | +miei | +mie | +tuo | +tua | +tuoi | thy +tue | +suo | +sua | +suoi | his, her +sue | +nostro | our +nostra | +nostri | +nostre | +vostro | your +vostra | +vostri | +vostre | +mi | me +ti | thee +ci | us, there +vi | you, there +lo | him, the +la | her, the +li | them +le | them, the +gli | to him, the +ne | from there etc +il | the +un | a +uno | a +una | a +ma | but +ed | and +se | if +perché | why, because +anche | also +come | how +dov | where (as dov') +dove | where +che | who, that +chi | who +cui | whom +non | not +più | more +quale | who, that +quanto | how much +quanti | +quanta | +quante | +quello | that +quelli | +quella | +quelle | +questo | this +questi | +questa | +queste | +si | yes +tutto | all +tutti | all + + | single letter forms: + +a | at 
+c | as c' for ce or ci +e | and +i | the +l | as l' +o | or + + | forms of avere, to have (not including the infinitive): + +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute + + | forms of essere, to be (not including the infinitive): +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo + + | forms of fare, to do (not including the infinitive, fa, fat-): +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo + + | forms of stare, to be (not including the infinitive): +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/defaults/solr/lang/stopwords_ja.txt b/defaults/solr/lang/stopwords_ja.txt new file mode 100644 index 000000000..d4321be6b --- /dev/null +++ b/defaults/solr/lang/stopwords_ja.txt @@ -0,0 +1,127 @@ +# +# This file defines a stopword set for Japanese. +# +# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. +# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 +# for frequency lists, etc. 
that can be useful for making your own set (if desired) +# +# Note that there is an overlap between these stopwords and the terms stopped when used +# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note +# that comments are not allowed on the same line as stopwords. +# +# Also note that stopping is done in a case-insensitive manner. Change your StopFilter +# configuration if you need case-sensitive stopping. Lastly, note that stopping is done +# using the same character width as the entries in this file. Since this StopFilter is +# normally done after a CJKWidthFilter in your chain, you would usually want your romaji +# entries to be in half-width and your kana entries to be in full-width. +# +の +に +は +を +た +が +で +て +と +し +れ +さ +ある +いる +も +する +から +な +こと +として +い +や +れる +など +なっ +ない +この +ため +その +あっ +よう +また +もの +という +あり +まで +られ +なる +へ +か +だ +これ +によって +により +おり +より +による +ず +なり +られる +において +ば +なかっ +なく +しかし +について +せ +だっ +その後 +できる +それ +う +ので +なお +のみ +でき +き +つ +における +および +いう +さらに +でも +ら +たり +その他 +に関する +たち +ます +ん +なら +に対して +特に +せる +及び +これら +とき +では +にて +ほか +ながら +うち +そして +とともに +ただし +かつて +それぞれ +または +お +ほど +ものの +に対する +ほとんど +と共に +といった +です +とも +ところ +ここ +##### End of file diff --git a/defaults/solr/lang/stopwords_lv.txt b/defaults/solr/lang/stopwords_lv.txt new file mode 100644 index 000000000..e21a23c06 --- /dev/null +++ b/defaults/solr/lang/stopwords_lv.txt @@ -0,0 +1,172 @@ +# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins +# the original list of over 800 forms was refined: +# pronouns, adverbs, interjections were removed +# +# prepositions +aiz +ap +ar +apakš +ārpus +augšpus +bez +caur +dēļ +gar +iekš +iz +kopš +labad +lejpus +līdz +no +otrpus +pa +par +pār +pēc +pie +pirms +pret +priekš +starp +šaipus +uz +viņpus +virs +virspus +zem +apakšpus +# Conjunctions +un +bet +jo +ja +ka +lai +tomēr +tikko +turpretī +arī +kaut +gan +tādēļ +tā +ne +tikvien +vien +kā +ir +te +vai +kamēr +# Particles +ar +diezin +droši 
+diemžēl +nebūt +ik +it +taču +nu +pat +tiklab +iekšpus +nedz +tik +nevis +turpretim +jeb +iekam +iekām +iekāms +kolīdz +līdzko +tiklīdz +jebšu +tālab +tāpēc +nekā +itin +jā +jau +jel +nē +nezin +tad +tikai +vis +tak +iekams +vien +# modal verbs +būt +biju +biji +bija +bijām +bijāt +esmu +esi +esam +esat +būšu +būsi +būs +būsim +būsiet +tikt +tiku +tiki +tika +tikām +tikāt +tieku +tiec +tiek +tiekam +tiekat +tikšu +tiks +tiksim +tiksiet +tapt +tapi +tapāt +topat +tapšu +tapsi +taps +tapsim +tapsiet +kļūt +kļuvu +kļuvi +kļuva +kļuvām +kļuvāt +kļūstu +kļūsti +kļūst +kļūstam +kļūstat +kļūšu +kļūsi +kļūs +kļūsim +kļūsiet +# verbs +varēt +varēju +varējām +varēšu +varēsim +var +varēji +varējāt +varēsi +varēsiet +varat +varēja +varēs diff --git a/defaults/solr/lang/stopwords_nl.txt b/defaults/solr/lang/stopwords_nl.txt new file mode 100644 index 000000000..f4d61f509 --- /dev/null +++ b/defaults/solr/lang/stopwords_nl.txt @@ -0,0 +1,117 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. + +de | the +en | and +van | of, from +ik | I, the ego +te | (1) chez, at etc, (2) to, (3) too +dat | that, which +die | that, those, who, which +in | in, inside +een | a, an, one +hij | he +het | the, it +niet | not, nothing, naught +zijn | (1) to be, being, (2) his, one's, its +is | is +was | (1) was, past tense of all persons sing. 
of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river +op | on, upon, at, in, up, used up +aan | on, upon, to (as dative) +met | with, by +als | like, such as, when +voor | (1) before, in front of, (2) furrow +had | had, past tense all persons sing. of 'hebben' (have) +er | there +maar | but, only +om | round, about, for etc +hem | him +dan | then +zou | should/would, past tense all persons sing. of 'zullen' +of | or, whether, if +wat | what, something, anything +mijn | possessive and noun 'mine' +men | people, 'one' +dit | this +zo | so, thus, in this way +door | through by +over | over, across +ze | she, her, they, them +zich | oneself +bij | (1) a bee, (2) by, near, at +ook | also, too +tot | till, until +je | you +mij | me +uit | out of, from +der | Old Dutch form of 'van der' still found in surnames +daar | (1) there, (2) because +haar | (1) her, their, them, (2) hair +naar | (1) unpleasant, unwell etc, (2) towards, (3) as +heb | present first person sing. of 'to have' +hoe | how, why +heeft | present third person sing. of 'to have' +hebben | 'to have' and various parts thereof +deze | this +u | you +want | (1) for, (2) mitten, (3) rigging +nog | yet, still +zal | 'shall', first and third person sing. 
of verb 'zullen' (will) +me | me +zij | she, they +nu | now +ge | 'thou', still used in Belgium and south Netherlands +geen | none +omdat | because +iets | something, somewhat +worden | to become, grow, get +toch | yet, still +al | all, every, each +waren | (1) 'were' (2) to wander, (3) wares, (3) +veel | much, many +meer | (1) more, (2) lake +doen | to do, to make +toen | then, when +moet | noun 'spot/mote' and present form of 'to must' +ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' +zonder | without +kan | noun 'can' and present form of 'to be able' +hun | their, them +dus | so, consequently +alles | all, everything, anything +onder | under, beneath +ja | yes, of course +eens | once, one day +hier | here +wie | who +werd | imperfect third person sing. of 'become' +altijd | always +doch | yet, but etc +wordt | present third person sing. of 'become' +wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans +kunnen | to be able +ons | us/our +zelf | self +tegen | against, towards, at +na | after, near +reeds | already +wil | (1) present tense of 'want', (2) 'will', noun, (3) fender +kon | could; past tense of 'to be able' +niets | nothing +uw | your +iemand | somebody +geweest | been; past participle of 'be' +andere | other diff --git a/defaults/solr/lang/stopwords_no.txt b/defaults/solr/lang/stopwords_no.txt new file mode 100644 index 000000000..e76f36e69 --- /dev/null +++ b/defaults/solr/lang/stopwords_no.txt @@ -0,0 +1,192 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A Norwegian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. 
Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + +og | and +i | in +jeg | I +det | it/this/that +at | to (w. inf.) +en | a/an +et | a/an +den | it/this/that +til | to +er | is/am/are +som | who/that +på | on +de | they / you(formal) +med | with +han | he +av | of +ikke | not +ikkje | not * +der | there +så | so +var | was/were +meg | me +seg | you +men | but +ett | one +har | have +om | about +vi | we +min | my +mitt | my +ha | have +hadde | had +hun | she +nå | now +over | over +da | when/as +ved | by/know +fra | from +du | you +ut | out +sin | your +dem | them +oss | us +opp | up +man | you/one +kan | can +hans | his +hvor | where +eller | or +hva | what +skal | shall/must +selv | self (reflective) +sjøl | self (reflective) +her | here +alle | all +vil | will +bli | become +ble | became +blei | became * +blitt | have become +kunne | could +inn | in +når | when +være | be +kom | come +noen | some +noe | some +ville | would +dere | you +som | who/which/that +deres | their/theirs +kun | only/just +ja | yes +etter | after +ned | down +skulle | should +denne | this +for | for/because +deg | you +si | hers/his +sine | hers/his +sitt | hers/his +mot | against +å | to +meget | much +hvorfor | why +dette | this +disse | these/those +uten | without +hvordan | how +ingen | none +din | your +ditt | your +blir | become +samme | same +hvilken | which +hvilke | which (plural) +sånn | such a +inni | inside/within +mellom | between +vår | our +hver | each +hvem | who +vors | us/ours +hvis | whose +både | both +bare | only/just +enn | than +fordi | as/because +før | before +mange | many +også | also +slik | just +vært | been +være | to be +båe | both * +begge | both +siden | since +dykk | your * +dykkar | yours * +dei | they * +deira | them * +deires | theirs * +deim | them * +di | your (fem.) 
* +då | as/when * +eg | I * +ein | a/an * +eit | a/an * +eitt | a/an * +elles | or * +honom | he * +hjå | at * +ho | she * +hoe | she * +henne | her +hennar | her/hers +hennes | hers +hoss | how * +hossen | how * +ikkje | not * +ingi | noone * +inkje | noone * +korleis | how * +korso | how * +kva | what/which * +kvar | where * +kvarhelst | where * +kven | who/whom * +kvi | why * +kvifor | why * +me | we * +medan | while * +mi | my * +mine | my * +mykje | much * +no | now * +nokon | some (masc./neut.) * +noka | some (fem.) * +nokor | some * +noko | some * +nokre | some * +si | his/hers * +sia | since * +sidan | since * +so | so * +somt | some * +somme | some * +um | about* +upp | up * +vere | be * +vore | was * +verte | become * +vort | become * +varte | became * +vart | became * + diff --git a/defaults/solr/lang/stopwords_pt.txt b/defaults/solr/lang/stopwords_pt.txt new file mode 100644 index 000000000..276c1b446 --- /dev/null +++ b/defaults/solr/lang/stopwords_pt.txt @@ -0,0 +1,251 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | A Portuguese stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | of, from +a | the; to, at; her +o | the; him +que | who, that +e | and +do | de + o +da | de + a +em | in +um | a +para | for + | é from SER +com | with +não | not, no +uma | a +os | the; them +no | em + o +se | himself etc +na | em + a +por | for +mais | more +as | the; them +dos | de + os +como | as, like +mas | but + | foi from SER +ao | a + o +ele | he +das | de + as + | tem from TER +à | a + a +seu | his +sua | her +ou | or + | ser from SER +quando | when +muito | much + | há from HAV +nos | em + os; us +já | already, now + | está from EST +eu | I +também | also +só | only, just +pelo | per + o +pela | per + a +até | up to +isso | that +ela | she +entre | between + | era from SER +depois | after +sem | without +mesmo | same +aos | a + os + | ter from TER +seus | his +quem | whom +nas | em + as +me | me +esse | that +eles | they + | estão from EST +você | you + | tinha from TER + | foram from SER +essa | that +num | em + um +nem | nor +suas | her +meu | my +às | a + as +minha | my + | têm from TER +numa | em + uma +pelos | per + os +elas | they + | havia from HAV + | seja from SER +qual | which + | será from SER +nós | we + | tenho from TER +lhe | to him, her +deles | of them +essas | those +esses | those +pelas | per + as +este | this + | fosse from SER +dele | of him + + | other words. There are many contractions such as naquele = em+aquele, + | mo = me+o, but they are rare. + | Indefinite article plural forms are also rare.
+ +tu | thou +te | thee +vocês | you (plural) +vos | you +lhes | to them +meus | my +minhas +teu | thy +tua +teus +tuas +nosso | our +nossa +nossos +nossas + +dela | of her +delas | of them + +esta | this +estes | these +estas | these +aquele | that +aquela | that +aqueles | those +aquelas | those +isto | this +aquilo | that + + | forms of estar, to be (not including the infinitive): +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem + + | forms of haver, to have (not including the infinitive): +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam + + | forms of ser, to be (not including the infinitive): +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam + + | forms of ter, to have (not including the infinitive): +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/defaults/solr/lang/stopwords_ro.txt b/defaults/solr/lang/stopwords_ro.txt new file mode 100644 index 000000000..4fdee90a5 --- /dev/null +++ b/defaults/solr/lang/stopwords_ro.txt @@ -0,0 +1,233 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +acea +aceasta +această +aceea +acei +aceia +acel +acela +acele +acelea +acest +acesta +aceste +acestea +aceşti +aceştia +acolo +acum +ai +aia +aibă +aici +al +ăla +ale +alea +ălea +altceva +altcineva +am +ar +are +aş +aşadar +asemenea +asta +ăsta +astăzi +astea +ăstea +ăştia +asupra +aţi +au +avea +avem +aveţi +azi +bine +bucur +bună +ca +că +căci +când +care +cărei +căror +cărui +cât +câte +câţi +către +câtva +ce +cel +ceva +chiar +cînd +cine +cineva +cît +cîte +cîţi +cîtva +contra +cu +cum +cumva +curând +curînd +da +dă +dacă +dar +datorită +de +deci +deja +deoarece +departe +deşi +din +dinaintea +dintr +dintre +drept +după +ea +ei +el +ele +eram +este +eşti +eu +face +fără +fi +fie +fiecare +fii +fim +fiţi +iar +ieri +îi +îl +îmi +împotriva +în +înainte +înaintea +încât +încît +încotro +între +întrucât +întrucît +îţi +la +lângă +le +li +lîngă +lor +lui +mă +mâine +mea +mei +mele +mereu +meu +mi +mine +mult +multă +mulţi +ne +nicăieri +nici +nimeni +nişte +noastră +noastre +noi +noştri +nostru +nu +ori +oricând +oricare +oricât +orice +oricînd +oricine +oricît +oricum +oriunde +până +pe +pentru +peste +pînă +poate +pot +prea +prima +primul +prin +printr +sa +să +săi +sale +sau +său +se +şi +sînt +sîntem +sînteţi +spre +sub +sunt +suntem +sunteţi +ta +tăi +tale +tău +te +ţi +ţie +tine +toată +toate +tot +toţi +totuşi +tu +un +una +unde +undeva +unei +unele +uneori +unor +vă +vi +voastră +voastre +voi +voştri +vostru +vouă +vreo +vreun diff --git a/defaults/solr/lang/stopwords_ru.txt b/defaults/solr/lang/stopwords_ru.txt new file mode 100644 index 000000000..643076934 --- /dev/null +++ b/defaults/solr/lang/stopwords_ru.txt @@ -0,0 +1,241 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. + + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | letter `ё' is translated to `е'. + +и | and +в | in/into +во | alternative form +не | not +что | what/that +он | he +на | on/onto +я | i +с | from +со | alternative form +как | how +а | milder form of `no' (but) +то | conjunction and form of `that' +все | all +она | she +так | so, thus +его | him +но | but +да | yes/and +ты | thou +к | towards, by +у | around, chez +же | intensifier particle +вы | you +за | beyond, behind +бы | conditional/subj. particle +по | up to, along +только | only +ее | her +мне | to me +было | it was +вот | here is/are, particle +от | away from +меня | me +еще | still, yet, more +нет | no, there isnt/arent +о | about +из | out of +ему | to him +теперь | now +когда | when +даже | even +ну | so, well +вдруг | suddenly +ли | interrogative particle +если | if +уже | already, but homonym of `narrower' +или | or +ни | neither +быть | to be +был | he was +него | prepositional form of его +до | up to +вас | you accusative +нибудь | indef. 
suffix preceded by hyphen +опять | again +уж | already, but homonym of `adder' +вам | to you +сказал | he said +ведь | particle `after all' +там | there +потом | then +себя | oneself +ничего | nothing +ей | to her +может | usually with `быть' as `maybe' +они | they +тут | here +где | where +есть | there is/are +надо | got to, must +ней | prepositional form of ей +для | for +мы | we +тебя | thee +их | them, their +чем | than +была | she was +сам | self +чтоб | in order to +без | without +будто | as if +человек | man, person, one +чего | genitive form of `what' +раз | once +тоже | also +себе | to oneself +под | beneath +жизнь | life +будет | will be +ж | short form of intensifer particle `же' +тогда | then +кто | who +этот | this +говорил | was saying +того | genitive form of `that' +потому | for that reason +этого | genitive form of `this' +какой | which +совсем | altogether +ним | prepositional form of `его', `они' +здесь | here +этом | prepositional form of `этот' +один | one +почти | almost +мой | my +тем | instrumental/dative plural of `тот', `то' +чтобы | full form of `in order that' +нее | her (acc.) +кажется | it seems +сейчас | now +были | they were +куда | where to +зачем | why +сказать | to say +всех | all (acc., gen. preposn. plural) +никогда | never +сегодня | today +можно | possible, one can +при | by +наконец | finally +два | two +об | alternative form of `о', about +другой | another +хоть | even +после | after +над | above +больше | more +тот | that one (masc.) +через | across, in +эти | these +нас | us +про | about +всего | in all, only, of all +них | prepositional form of `они' (they) +какая | which, feminine +много | lots +разве | interrogative particle +сказала | she said +три | three +эту | this, acc. fem. sing. +моя | my, feminine +впрочем | moreover, besides +хорошо | good +свою | ones own, acc. fem. sing. +этой | oblique form of `эта', fem. `this' +перед | in front of +иногда | sometimes +лучше | better +чуть | a little +том | preposn. 
form of `that one' +нельзя | one must not +такой | such a one +им | to them +более | more +всегда | always +конечно | of course +всю | acc. fem. sing of `all' +между | between + + + | b: some paradigms + | + | personal pronouns + | + | я меня мне мной [мною] + | ты тебя тебе тобой [тобою] + | он его ему им [него, нему, ним] + | она ее эи ею [нее, нэи, нею] + | оно его ему им [него, нему, ним] + | + | мы нас нам нами + | вы вас вам вами + | они их им ими [них, ним, ними] + | + | себя себе собой [собою] + | + | demonstrative pronouns: этот (this), тот (that) + | + | этот эта это эти + | этого эты это эти + | этого этой этого этих + | этому этой этому этим + | этим этой этим [этою] этими + | этом этой этом этих + | + | тот та то те + | того ту то те + | того той того тех + | тому той тому тем + | тем той тем [тою] теми + | том той том тех + | + | determinative pronouns + | + | (a) весь (all) + | + | весь вся все все + | всего всю все все + | всего всей всего всех + | всему всей всему всем + | всем всей всем [всею] всеми + | всем всей всем всех + | + | (b) сам (himself etc) + | + | сам сама само сами + | самого саму само самих + | самого самой самого самих + | самому самой самому самим + | самим самой самим [самою] самими + | самом самой самом самих + | + | stems of verbs `to be', `to have', `to do' and modal + | + | быть бы буд быв есть суть + | име + | дел + | мог мож мочь + | уме + | хоч хот + | долж + | можн + | нужн + | нельзя + diff --git a/defaults/solr/lang/stopwords_sv.txt b/defaults/solr/lang/stopwords_sv.txt new file mode 100644 index 000000000..22bddfd8c --- /dev/null +++ b/defaults/solr/lang/stopwords_sv.txt @@ -0,0 +1,131 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ + | A Swedish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + +och | and +det | it, this/that +att | to (with infinitive) +i | in, at +en | a +jag | I +hon | she +som | who, that +han | he +på | on +den | it, this/that +med | with +var | where, each +sig | him(self) etc +för | for +så | so (also: seed) +till | to +är | is +men | but +ett | a +om | if; around, about +hade | had +de | they, these/those +av | of +icke | not, no +mig | me +du | you +henne | her +då | then, when +sin | his +nu | now +har | have +inte | inte någon = no one +hans | his +honom | him +skulle | 'sake' +hennes | her +där | there +min | my +man | one (pronoun) +ej | nor +vid | at, by, on (also: vast) +kunde | could +något | some etc +från | from, off +ut | out +när | when +efter | after, behind +upp | up +vi | we +dem | them +vara | be +vad | what +över | over +än | than +dig | you +kan | can +sina | his +här | here +ha | have +mot | towards +alla | all +under | under (also: wonder) +någon | some etc +eller | or (else) +allt | all +mycket | much +sedan | since +ju | why +denna | this/that +själv | myself, yourself etc +detta | this/that +åt | to +utan | without +varit | was +hur | how +ingen | no +mitt | my +ni | you +bli | to be, become +blev | from bli +oss | us +din | thy +dessa | these/those +några | some etc +deras | their +blir | from bli +mina | my +samma | (the) same +vilken | who, that +er | you, your +sådan | such a +vår | our +blivit | from bli +dess | its +inom | within +mellan | between +sådant | such a +varför | why +varje | each +vilka | who, that +ditt | thy +vem | who +vilket | who, that +sitta | his +sådana | such a +vart | each +dina | thy +vars | whose +vårt | our +våra | our +ert | your 
+era | your +vilkas | whose + diff --git a/defaults/solr/lang/stopwords_th.txt b/defaults/solr/lang/stopwords_th.txt new file mode 100644 index 000000000..07f0fabe6 --- /dev/null +++ b/defaults/solr/lang/stopwords_th.txt @@ -0,0 +1,119 @@ +# Thai stopwords from: +# "Opinion Detection in Thai Political News Columns +# Based on Subjectivity Analysis" +# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak +ไว้ +ไม่ +ไป +ได้ +ให้ +ใน +โดย +แห่ง +แล้ว +และ +แรก +แบบ +แต่ +เอง +เห็น +เลย +เริ่ม +เรา +เมื่อ +เพื่อ +เพราะ +เป็นการ +เป็น +เปิดเผย +เปิด +เนื่องจาก +เดียวกัน +เดียว +เช่น +เฉพาะ +เคย +เข้า +เขา +อีก +อาจ +อะไร +ออก +อย่าง +อยู่ +อยาก +หาก +หลาย +หลังจาก +หลัง +หรือ +หนึ่ง +ส่วน +ส่ง +สุด +สําหรับ +ว่า +วัน +ลง +ร่วม +ราย +รับ +ระหว่าง +รวม +ยัง +มี +มาก +มา +พร้อม +พบ +ผ่าน +ผล +บาง +น่า +นี้ +นํา +นั้น +นัก +นอกจาก +ทุก +ที่สุด +ที่ +ทําให้ +ทํา +ทาง +ทั้งนี้ +ทั้ง +ถ้า +ถูก +ถึง +ต้อง +ต่างๆ +ต่าง +ต่อ +ตาม +ตั้งแต่ +ตั้ง +ด้าน +ด้วย +ดัง +ซึ่ง +ช่วง +จึง +จาก +จัด +จะ +คือ +ความ +ครั้ง +คง +ขึ้น +ของ +ขอ +ขณะ +ก่อน +ก็ +การ +กับ +กัน +กว่า +กล่าว diff --git a/defaults/solr/lang/stopwords_tr.txt b/defaults/solr/lang/stopwords_tr.txt new file mode 100644 index 000000000..84d9408d4 --- /dev/null +++ b/defaults/solr/lang/stopwords_tr.txt @@ -0,0 +1,212 @@ +# Turkish stopwords from LUCENE-559 +# merged with the list from "Information Retrieval on Turkish Texts" +# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) +acaba +altmış +altı +ama +ancak +arada +aslında +ayrıca +bana +bazı +belki +ben +benden +beni +benim +beri +beş +bile +bin +bir +birçok +biri +birkaç +birkez +birşey +birşeyi +biz +bize +bizden +bizi +bizim +böyle +böylece +bu +buna +bunda +bundan +bunlar +bunları +bunların +bunu +bunun +burada +çok +çünkü +da +daha +dahi +de +defa +değil +diğer +diye +doksan +dokuz +dolayı +dolayısıyla +dört +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +eğer +elli +en +etmesi +etti +ettiği +ettiğini +gibi +göre +halen +hangi 
+hatta +hem +henüz +hep +hepsi +her +herhangi +herkesin +hiç +hiçbir +için +iki +ile +ilgili +ise +işte +itibaren +itibariyle +kadar +karşın +katrilyon +kendi +kendilerine +kendini +kendisi +kendisine +kendisini +kez +ki +kim +kimden +kime +kimi +kimse +kırk +milyar +milyon +mu +mü +mı +nasıl +ne +neden +nedenle +nerde +nerede +nereye +niye +niçin +o +olan +olarak +oldu +olduğu +olduğunu +olduklarını +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor +on +ona +ondan +onlar +onlardan +onları +onların +onu +onun +otuz +oysa +öyle +pek +rağmen +sadece +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +sizi +sizin +şey +şeyden +şeyi +şeyler +şöyle +şu +şuna +şunda +şundan +şunları +şunu +tarafından +trilyon +tüm +üç +üzere +var +vardı +ve +veya +ya +yani +yapacak +yapılan +yapılması +yapıyor +yapmak +yaptı +yaptığı +yaptığını +yaptıkları +yedi +yerine +yetmiş +yine +yirmi +yoksa +yüz +zaten diff --git a/defaults/solr/protwords.txt b/defaults/solr/protwords.txt new file mode 100644 index 000000000..1dfc0abec --- /dev/null +++ b/defaults/solr/protwords.txt @@ -0,0 +1,21 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +# Use a protected word file to protect against the stemmer reducing two +# unrelated words to the same base word. 
+ +# Some non-words that normally won't be encountered, +# just to test that they won't be stemmed. +dontstems +zwhacky + diff --git a/defaults/solr/schema.xml b/defaults/solr/schema.xml new file mode 100755 index 000000000..52b36042a --- /dev/null +++ b/defaults/solr/schema.xml @@ -0,0 +1,1012 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + iddiff --git a/defaults/solr/solr.xml b/defaults/solr/solr.xml new file mode 100644 index 000000000..6d4d9f0e1 --- /dev/null +++ b/defaults/solr/solr.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/defaults/solr/solrconfig.xml b/defaults/solr/solrconfig.xml new file mode 100755 index 000000000..5f50ab580 --- /dev/null +++ b/defaults/solr/solrconfig.xml @@ -0,0 +1,1622 @@ + + + + + + + + + ${solr.abortOnConfigurationError:true} + + + LUCENE_36 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.data.dir:} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1024 + + + + + + + + + + + + + + + + + + + + + + true + + + + + + 20 + + + 200 + + + + + + + + + + + + static firstSearcher warming in solrconfig.xml + + + + + + false + + + 2 + + + + + + + + + + + + + + + + + + + + + + + explicit + 10 + text + + + + + + + + + + + + + + explicit + + + velocity + + browse + layout + Solritas + + text + edismax + *:* + 10 + *,score + + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + + text,features,name,sku,id,manu,cat + 3 + + + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + + + on + cat + manu_exact + ipod + GB + 1 + cat,inStock + after + price + 0 + 600 + 50 + popularity + 0 + 10 + 3 + manufacturedate_dt + NOW/YEAR-10YEARS + NOW + +1YEAR + before + after + + + + on + text features name + 0 + name + + + spellcheck + + + 
+ + + + + + + + + + + + + + + + + + + + text + true + ignored_ + + + true + links + ignored_ + + + + + + + + + + + + + + + + + + + + + + + + solrpingquery + + + all + + + + + + + explicit + true + + + + + + + + + + + + textSpell + + + + + + default + name + spellchecker + + + + + + + + + + + + + + + + + + text + false + false + 1 + + + spellcheck + + + + + + + + + + text + true + + + tvComponent + + + + + + + + + default + + + org.carrot2.clustering.lingo.LingoClusteringAlgorithm + + + 20 + + + clustering/carrot2 + + + ENGLISH + + + stc + org.carrot2.clustering.stc.STCClusteringAlgorithm + + + + + + + true + default + true + + name + id + + features + + true + + + + false + + text + edismax + + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 + + *:* + 10 + *,score + + + clustering + + + + + + + + + + true + + + terms + + + + + + + + string + elevate.xml + + + + + + explicit + text + + + elevator + + + + + + + + + + + 100 + + + + + + + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} + + + + + + + ]]> + ]]> + + + + + + + + + + + + + + + + + + + + + ,, + ,, + ,, + ,, + ,]]> + ]]> + + + + + + 10 + .,!? + + + + + + + WORD + + en + US + + + + + + + + + + + + + + + + + + + text/plain; charset=UTF-8 + + + + + + + + + 5 + + + + + + + + + + + + + *:* + + + + + + diff --git a/defaults/solr/stopwords.txt b/defaults/solr/stopwords.txt new file mode 100644 index 000000000..ae1e83eeb --- /dev/null +++ b/defaults/solr/stopwords.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/defaults/solr/synonyms.txt b/defaults/solr/synonyms.txt new file mode 100644 index 000000000..7f7212830 --- /dev/null +++ b/defaults/solr/synonyms.txt @@ -0,0 +1,29 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaafoo => aaabar +bbbfoo => bbbfoo bbbbar +cccfoo => cccbar cccbaz +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming +#after us won't split it into two words. 
+ +# Synonym mappings can be used for spelling correction too +pixima => pixma + diff --git a/htroot/BlogComments.java b/htroot/BlogComments.java index 048e066e5..8e9097bab 100644 --- a/htroot/BlogComments.java +++ b/htroot/BlogComments.java @@ -39,9 +39,11 @@ import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.FileUtils; import net.yacy.peers.Network; import net.yacy.search.Switchboard; + +import com.google.common.io.Files; + import de.anomic.data.BlogBoard; import de.anomic.data.BlogBoard.BlogEntry; import de.anomic.data.BlogBoardComments; @@ -53,7 +55,7 @@ import de.anomic.server.serverSwitch; public class BlogComments { private static final String DEFAULT_PAGE = "blog_default"; - + public static String dateString(final Date date) { return Blog.dateString(date); } @@ -142,7 +144,7 @@ public class BlogComments { final File notifierSource = new File(sb.getAppPath(), sb.getConfig("htRootPath","htroot") + "/env/grafics/message.gif"); final File notifierDest = new File(sb.getDataPath("htDocsPath", "DATA/HTDOCS"), "notifier.gif"); try { - FileUtils.copy(notifierSource, notifierDest); + Files.copy(notifierSource, notifierDest); } catch (final IOException e) { Log.logSevere("MESSAGE", "NEW MESSAGE ARRIVED! 
(error: " + e.getMessage() + ")"); diff --git a/htroot/ConfigAppearance_p.java b/htroot/ConfigAppearance_p.java index 1bddfa01b..331c11c5f 100644 --- a/htroot/ConfigAppearance_p.java +++ b/htroot/ConfigAppearance_p.java @@ -1,4 +1,4 @@ -// ConfigAppearance_p.java +// ConfigAppearance_p.java // ----------------------- // part of YaCy // (C) by Michael Peter Christen; mc@yacy.net @@ -14,7 +14,7 @@ //$LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -34,6 +34,7 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; +import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -44,9 +45,10 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.util.FileUtils; import net.yacy.search.Switchboard; +import com.google.common.io.Files; + import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -import java.util.Collections; public class ConfigAppearance_p { @@ -77,7 +79,7 @@ public class ConfigAppearance_p { if (skinFiles.contains(selectedSkin)) { changeSkin(sb, skinPath, selectedSkin); } - + } if (post.containsKey("delete_button")) { @@ -91,7 +93,7 @@ public class ConfigAppearance_p { FileUtils.deletedelete(skinfile); } } - + if (post.containsKey("install_button")) { // load skin from URL final String url = post.get("url"); @@ -112,7 +114,7 @@ public class ConfigAppearance_p { while (it.hasNext()) { bw.write(it.next() + "\n"); } - + bw.close(); } catch (final IOException e) { prop.put("status", "2");// error saving the skin @@ -148,7 +150,7 @@ public class ConfigAppearance_p { } prop.put("skinlist", count); prop.putHTML("currentskin", env.getConfig("currentSkin", "default")); - + // write colors from generic skin Iterator i = 
env.configKeys(); while (i.hasNext()) { @@ -165,7 +167,7 @@ public class ConfigAppearance_p { styleFile.getParentFile().mkdirs(); try { - FileUtils.copy(skinFile, styleFile); + Files.copy(skinFile, styleFile); sb.setConfig("currentSkin", skin.substring(0, skin.length() - 4)); return true; } catch (final IOException e) { diff --git a/htroot/IndexFederated_p.html b/htroot/IndexFederated_p.html index 8d2ebdb72..9089f0029 100644 --- a/htroot/IndexFederated_p.html +++ b/htroot/IndexFederated_p.html @@ -21,8 +21,8 @@
- - + + You can just switch on or off this index. If you switch it off, you will not be able to search with YaCy any more.
@@ -30,8 +30,17 @@
- - + + + + Experimental embedded solr index. +
+ + +
+ + + You can set one or more Solr targets here. If you wish to set several targets, then list them in the 'Solr URL' field using a ',' (comma) as separator. diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index fce03a821..0d1094a4d 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -31,9 +31,9 @@ import java.util.Iterator; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.solr.SolrConnector; -import net.yacy.cora.services.federated.solr.SolrShardingConnector; -import net.yacy.cora.services.federated.solr.SolrShardingSelection; -import net.yacy.cora.services.federated.solr.SolrSingleConnector; +import net.yacy.cora.services.federated.solr.ShardSolrConnector; +import net.yacy.cora.services.federated.solr.ShardSelection; +import net.yacy.cora.services.federated.solr.SingleSolrConnector; import net.yacy.cora.storage.ConfigurationSet; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; @@ -89,7 +89,7 @@ public class IndexFederated_p { // switch on final boolean usesolr = sb.getConfigBool("federated.service.solr.indexing.enabled", false) & solrurls.length() > 0; try { - sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new SolrShardingConnector(solrurls, SolrShardingSelection.Method.MODULO_HOST_MD5, 10000, true) : null); + sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr((usesolr) ? new ShardSolrConnector(solrurls, ShardSelection.Method.MODULO_HOST_MD5, 10000, true) : null); } catch (final IOException e) { Log.logException(e); sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr(null); @@ -132,8 +132,8 @@ public class IndexFederated_p { } else { prop.put("table", 1); final SolrConnector solr = sb.indexSegments.segment(Segments.Process.LOCALCRAWLING).getSolr(); - final long[] size = (solr instanceof SolrShardingConnector) ? 
((SolrShardingConnector) solr).getSizeList() : new long[]{((SolrSingleConnector) solr).getSize()}; - final String[] urls = (solr instanceof SolrShardingConnector) ? ((SolrShardingConnector) solr).getAdminInterfaceList() : new String[]{((SolrSingleConnector) solr).getAdminInterface()}; + final long[] size = (solr instanceof ShardSolrConnector) ? ((ShardSolrConnector) solr).getSizeList() : new long[]{((SingleSolrConnector) solr).getSize()}; + final String[] urls = (solr instanceof ShardSolrConnector) ? ((ShardSolrConnector) solr).getAdminInterfaceList() : new String[]{((SingleSolrConnector) solr).getAdminInterface()}; boolean dark = false; for (int i = 0; i < size.length; i++) { prop.put("table_list_" + i + "_dark", dark ? 1 : 0); dark = !dark; diff --git a/htroot/Messages_p.java b/htroot/Messages_p.java index 5af5162f6..5b182972b 100644 --- a/htroot/Messages_p.java +++ b/htroot/Messages_p.java @@ -1,4 +1,4 @@ -// Messages_p.java +// Messages_p.java // ----------------------- // part of the AnomicHTTPD caching proxy // (C) by Michael Peter Christen; mc@yacy.net @@ -35,10 +35,11 @@ import java.util.TreeMap; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.kelondro.util.FileUtils; import net.yacy.peers.Seed; import net.yacy.search.Switchboard; +import com.google.common.io.Files; + import de.anomic.data.MessageBoard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -98,7 +99,7 @@ public class Messages_p { final File notifierSource = new File(sb.getAppPath(), sb.getConfig("htRootPath", "htroot") + "/env/grafics/empty.gif"); final File notifierDest = new File(sb.getDataPath("htDocsPath", "DATA/HTDOCS"), "notifier.gif"); try { - FileUtils.copy(notifierSource, notifierDest); + Files.copy(notifierSource, notifierDest); } catch (final IOException e) { } diff --git a/htroot/yacy/message.java b/htroot/yacy/message.java index f4446f997..592951e7e 100644 
--- a/htroot/yacy/message.java +++ b/htroot/yacy/message.java @@ -40,11 +40,13 @@ import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.FileUtils; -import net.yacy.peers.Seed; -import net.yacy.peers.Protocol; import net.yacy.peers.Network; +import net.yacy.peers.Protocol; +import net.yacy.peers.Seed; import net.yacy.search.Switchboard; + +import com.google.common.io.Files; + import de.anomic.data.MessageBoard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -150,7 +152,7 @@ public final class message { final File notifierSource = new File(sb.getAppPath(), sb.getConfig("htRootPath","htroot") + "/env/grafics/message.gif"); final File notifierDest = new File(sb.getDataPath("htDocsPath", "DATA/HTDOCS"), "notifier.gif"); try { - FileUtils.copy(notifierSource, notifierDest); + Files.copy(notifierSource, notifierDest); } catch (final IOException e) { Log.logSevere("MESSAGE", "NEW MESSAGE ARRIVED! 
(error: " + e.getMessage() + ")"); diff --git a/lib/apache-solr-core-3.6.0.jar b/lib/apache-solr-core-3.6.0.jar new file mode 100644 index 000000000..e1bb4f03c Binary files /dev/null and b/lib/apache-solr-core-3.6.0.jar differ diff --git a/lib/commons-httpclient-3.1.jar b/lib/commons-httpclient-3.1.jar new file mode 100644 index 000000000..7c59774ae Binary files /dev/null and b/lib/commons-httpclient-3.1.jar differ diff --git a/lib/commons-lang-2.6.jar b/lib/commons-lang-2.6.jar new file mode 100644 index 000000000..98467d3a6 Binary files /dev/null and b/lib/commons-lang-2.6.jar differ diff --git a/lib/dependencies.txt b/lib/dependencies.txt index ecf091210..b74ad1fc8 100644 --- a/lib/dependencies.txt +++ b/lib/dependencies.txt @@ -1,6 +1,11 @@ list of library-dependencies: -* apache-solr-solrj-3.4.0.jar depens on: +* jetty (these files were taken from the solr 3.6.0 example) +jetty-6.1.26-patched-JETTY-1340.jar +jetty-util-6.1.26-patched-JETTY-1340.jar +servlet-api-2.5-20081211.jar + +* apache-solr-solrj-3.4.0.jar depends on: commons-codec-1.4.jar commons-httpclient-3.1.jar commons-io-1.4.jar @@ -9,6 +14,29 @@ jcl-over-slf4j-1.6.1.jar slf4j-api-1.6.1.jar wstx-asl-3.2.7.jar +* apache-solr-core-3.6.0.jar depends on: +commons-codec-1.6.jar +commons-fileupload-1.2.1.jar +commons-httpclient-3.1.jar +commons-httpclient-3.1.jar +commons-io-2.1.jar +commons-lang-2.6.jar +geronimo-stax-api_1.0_spec-1.0.1.jar +guava-r05.jar +httpclient-4.2.jar +httpcore-4.2.jar +jcl-over-slf4j-1.6.1.jar +log4j-over-slf4j-1.6.1.jar +lucene-analyzers-3.6.0.jar +lucene-core-3.6.0.jar +lucene-highlighter-3.6.0.jar +lucene-phonetic-3.6.0.jar +lucene-spatial-3.6.0.jar +lucene-spellchecker-3.6.0.jar +slf4j-api-1.6.1.jar +slf4j-jdk14-1.6.1.jar +wstx-asl-3.2.7.jar + * pdfbox-1.6.0.jar depends on: fontbox-1.6.0.jar jempbox-1.6.0.jar diff --git a/lib/fontbox-1.6.License b/lib/fontbox-1.7.0.License similarity index 100% rename from lib/fontbox-1.6.License rename to lib/fontbox-1.7.0.License diff --git 
a/lib/fontbox-1.6.0.jar b/lib/fontbox-1.7.0.jar similarity index 71% rename from lib/fontbox-1.6.0.jar rename to lib/fontbox-1.7.0.jar index c3492fc29..2f3fbe2c3 100644 Binary files a/lib/fontbox-1.6.0.jar and b/lib/fontbox-1.7.0.jar differ diff --git a/lib/guava-r05.jar b/lib/guava-r05.jar new file mode 100644 index 000000000..0407b9c01 Binary files /dev/null and b/lib/guava-r05.jar differ diff --git a/lib/jempbox-1.6.0.License b/lib/jempbox-1.7.0.License similarity index 100% rename from lib/jempbox-1.6.0.License rename to lib/jempbox-1.7.0.License diff --git a/lib/jempbox-1.6.0.jar b/lib/jempbox-1.7.0.jar similarity index 78% rename from lib/jempbox-1.6.0.jar rename to lib/jempbox-1.7.0.jar index 5ca6d7a58..770b39bd6 100644 Binary files a/lib/jempbox-1.6.0.jar and b/lib/jempbox-1.7.0.jar differ diff --git a/lib/jetty-6.1.26-patched-JETTY-1340.jar b/lib/jetty-6.1.26-patched-JETTY-1340.jar new file mode 100644 index 000000000..6be492c92 Binary files /dev/null and b/lib/jetty-6.1.26-patched-JETTY-1340.jar differ diff --git a/lib/servlet-api.License b/lib/jetty-LICENSE-ASL.txt similarity index 99% rename from lib/servlet-api.License rename to lib/jetty-LICENSE-ASL.txt index 261eeb9e9..d64569567 100644 --- a/lib/servlet-api.License +++ b/lib/jetty-LICENSE-ASL.txt @@ -1,3 +1,4 @@ + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ diff --git a/lib/jetty-util-6.1.26-patched-JETTY-1340.jar b/lib/jetty-util-6.1.26-patched-JETTY-1340.jar new file mode 100644 index 000000000..1a9ace88d Binary files /dev/null and b/lib/jetty-util-6.1.26-patched-JETTY-1340.jar differ diff --git a/lib/jetty-util-LICENSE-ASL.txt b/lib/jetty-util-LICENSE-ASL.txt new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/lib/jetty-util-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/lib/log4j-over-slf4j-1.6.1.jar b/lib/log4j-over-slf4j-1.6.1.jar new file mode 100644 index 000000000..c4025f42e Binary files /dev/null and b/lib/log4j-over-slf4j-1.6.1.jar differ diff --git a/lib/lucene-analyzers-3.6.0.jar b/lib/lucene-analyzers-3.6.0.jar new file mode 100644 index 000000000..d7e5ccf77 Binary files /dev/null and b/lib/lucene-analyzers-3.6.0.jar differ diff --git a/lib/lucene-core-3.6.0.jar b/lib/lucene-core-3.6.0.jar new file mode 100644 index 000000000..5cb8dae5b Binary files /dev/null and b/lib/lucene-core-3.6.0.jar differ diff --git a/lib/lucene-highlighter-3.6.0.jar b/lib/lucene-highlighter-3.6.0.jar new file mode 100644 index 000000000..f07b95df4 Binary files /dev/null and b/lib/lucene-highlighter-3.6.0.jar differ diff --git a/lib/lucene-phonetic-3.6.0.jar b/lib/lucene-phonetic-3.6.0.jar new file mode 100644 index 000000000..c10979b07 Binary files /dev/null and b/lib/lucene-phonetic-3.6.0.jar differ diff --git a/lib/lucene-spatial-3.6.0.jar b/lib/lucene-spatial-3.6.0.jar new file mode 100644 index 000000000..c94aaad8c Binary files /dev/null and b/lib/lucene-spatial-3.6.0.jar differ diff --git a/lib/lucene-spellchecker-3.6.0.jar b/lib/lucene-spellchecker-3.6.0.jar new file mode 100644 index 000000000..3cbc48909 Binary files /dev/null and b/lib/lucene-spellchecker-3.6.0.jar differ diff --git a/lib/pdfbox-1.6.License b/lib/pdfbox-1.7.0.License similarity index 100% rename from lib/pdfbox-1.6.License rename to lib/pdfbox-1.7.0.License diff --git a/lib/pdfbox-1.6.0.jar b/lib/pdfbox-1.7.0.jar similarity index 85% rename from lib/pdfbox-1.6.0.jar rename to lib/pdfbox-1.7.0.jar index 75efe034f..9198f1047 100644 Binary files a/lib/pdfbox-1.6.0.jar and b/lib/pdfbox-1.7.0.jar differ diff --git a/lib/servlet-api-2.5-20081211.jar b/lib/servlet-api-2.5-20081211.jar new file mode 100644 index 000000000..b0537c4db Binary files /dev/null and b/lib/servlet-api-2.5-20081211.jar differ diff --git a/lib/servlet-api-LICENSE-ASL.txt 
b/lib/servlet-api-LICENSE-ASL.txt new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/lib/servlet-api-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/lib/servlet-api.jar b/lib/servlet-api.jar deleted file mode 100644 index 308fd7a38..000000000 Binary files a/lib/servlet-api.jar and /dev/null differ diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java index eb7a2631c..603e517d5 100644 --- a/source/de/anomic/crawler/ZURL.java +++ b/source/de/anomic/crawler/ZURL.java @@ -38,7 +38,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; import net.yacy.cora.services.federated.solr.SolrConnector; import net.yacy.cora.services.federated.solr.SolrDoc; -import net.yacy.cora.services.federated.solr.SolrShardingConnector; +import net.yacy.cora.services.federated.solr.ShardSolrConnector; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.Index; @@ -114,7 +114,7 @@ public class ZURL implements Iterable { this.stack = new LinkedBlockingQueue(); } - public ZURL(final SolrShardingConnector solrConnector, + public ZURL(final ShardSolrConnector solrConnector, final SolrConfiguration solrConfiguration) { this.solrConnector = solrConnector; this.solrConfiguration = solrConfiguration; diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index f32278361..23f36aba1 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -43,7 +43,10 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; +import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -54,6 +57,10 @@ import net.yacy.cora.storage.KeyList; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.MemoryControl; +import com.google.common.net.InetAddresses; +import 
com.google.common.util.concurrent.SimpleTimeLimiter; +import com.google.common.util.concurrent.TimeLimiter; + public class Domains { @@ -553,14 +560,16 @@ public class Domains { cacheHit_Insert++; } + final private static TimeLimiter timeLimiter = new SimpleTimeLimiter(Executors.newFixedThreadPool(20)); + /** * resolve a host address using a local DNS cache and a DNS lookup if necessary * @param host * @return the hosts InetAddress or null if the address cannot be resolved */ - public static InetAddress dnsResolve(String host) { - if ((host == null) || (host.length() == 0)) return null; - host = host.toLowerCase().trim(); + public static InetAddress dnsResolve(final String host0) { + if ((host0 == null) || (host0.length() == 0)) return null; + final String host = host0.toLowerCase().trim(); // try to simply parse the address InetAddress ip = parseInetAddress(host); if (ip != null) return ip; @@ -615,8 +624,23 @@ public class Domains { try { //final long t = System.currentTimeMillis(); Thread.currentThread().setName("Domains: DNS resolve of '" + host + "'"); // thread dump show which host is resolved - ip = TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone - //ip = InetAddress.getByName(host); // this makes the DNS request to backbone + if (InetAddresses.isInetAddress(host)) { + try { + ip = InetAddresses.forString(host); + Log.logInfo("Domains", "using guava for host resolution:" + host); + } catch (IllegalArgumentException e) { + ip = null; + } + } + if (ip == null) { + ip = timeLimiter.callWithTimeout(new Callable() { + @Override + public InetAddress call() throws Exception { + return InetAddress.getByName(host); + } + }, 1000L, TimeUnit.MILLISECONDS, false); + //ip = TimeoutRequest.getByName(host, 1000); // this makes the DNS request to backbone + } //.out.println("DNSLOOKUP-*LOOKUP* " + host + ", time = " + (System.currentTimeMillis() - t) + "ms"); } catch (final Throwable e) { // add new entries diff --git 
a/source/net/yacy/cora/protocol/TimeoutRequest.java b/source/net/yacy/cora/protocol/TimeoutRequest.java index fed518a13..071759c19 100644 --- a/source/net/yacy/cora/protocol/TimeoutRequest.java +++ b/source/net/yacy/cora/protocol/TimeoutRequest.java @@ -70,6 +70,7 @@ public class TimeoutRequest { try { final Future taskFuture = service.submit(this.call); final Runnable t = new Runnable() { + @Override public void run() { taskFuture.cancel(true); } }; service.execute(t); @@ -109,6 +110,7 @@ public class TimeoutRequest { */ public static boolean ping(final String host, final int port, final int timeout) throws ExecutionException { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { //long time = System.currentTimeMillis(); try { @@ -133,25 +135,6 @@ public class TimeoutRequest { }).call(timeout).booleanValue(); } - /** - * do a DNS lookup within a given time - * @param host - * @param timeout - * @return the InetAddress for a given domain name - * @throws ExecutionException - */ - public static InetAddress getByName(final String host, final long timeout) throws ExecutionException { - return new TimeoutRequest(new Callable() { - public InetAddress call() { - try { - return InetAddress.getByName(host); - } catch (final UnknownHostException e) { - return null; - } - } - }).call(timeout); - } - /** * perform a reverse domain name lookup for a given InetAddress within a given timeout * @param i @@ -161,6 +144,7 @@ public class TimeoutRequest { */ public static String getHostName(final InetAddress i, final long timeout) throws ExecutionException { return new TimeoutRequest(new Callable() { + @Override public String call() { return i.getHostName(); } }).call(timeout); } @@ -175,6 +159,7 @@ public class TimeoutRequest { public static boolean exists(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { try { return file.exists(); } catch (final SmbException 
e) { @@ -196,6 +181,7 @@ public class TimeoutRequest { public static boolean canRead(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { try { return file.canRead(); } catch (final SmbException e) { @@ -217,6 +203,7 @@ public class TimeoutRequest { public static boolean canWrite(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { try { return file.canWrite(); } catch (final SmbException e) { @@ -238,6 +225,7 @@ public class TimeoutRequest { public static boolean isHidden(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { try { return file.isHidden(); } catch (final SmbException e) { @@ -259,6 +247,7 @@ public class TimeoutRequest { public static boolean isDirectory(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Boolean call() { try { return file.isDirectory(); } catch (final SmbException e) { @@ -280,6 +269,7 @@ public class TimeoutRequest { public static long length(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Long call() { try { return file.length(); } catch (final SmbException e) { @@ -301,6 +291,7 @@ public class TimeoutRequest { public static long lastModified(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public Long call() { try { return file.lastModified(); } catch (final SmbException e) { @@ -322,6 +313,7 @@ public class TimeoutRequest { public static String[] list(final SmbFile file, final long timeout) throws IOException { try { return new TimeoutRequest(new Callable() { + @Override public String[] call() { try { return file.list(); } catch 
(final SmbException e) { @@ -334,11 +326,4 @@ public class TimeoutRequest { } } - public static void main(final String[] args) { - try { - System.out.println(getByName("yacy.net", 100)); - } catch (final ExecutionException e) { - e.printStackTrace(); - } - } } diff --git a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java b/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java similarity index 51% rename from source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java rename to source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java index 876c60a64..4962acdf6 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrSingleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/AbstractSolrConnector.java @@ -1,11 +1,7 @@ /** - * SolrSingleConnector - * Copyright 2011 by Michael Peter Christen - * First released 14.04.2011 at http://yacy.net - * - * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $ - * $LastChangedRevision: 7654 $ - * $LastChangedBy: orbiter $ + * AbstractSolrConnector + * Copyright 2012 by Michael Peter Christen + * First released 21.06.2012 at http://yacy.net * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -26,91 +22,36 @@ package net.yacy.cora.services.federated.solr; import java.io.File; import java.io.IOException; -import java.net.InetAddress; import java.util.ArrayList; import java.util.Collection; import java.util.List; -import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.protocol.Domains; import net.yacy.kelondro.logging.Log; +import net.yacy.search.index.SolrField; -import org.apache.http.HttpHost; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.AuthCache; -import org.apache.http.client.protocol.ClientContext; -import org.apache.http.impl.auth.BasicScheme; -import 
org.apache.http.impl.client.BasicAuthCache; -import org.apache.http.impl.client.BasicCredentialsProvider; -import org.apache.http.impl.client.DefaultHttpClient; -import org.apache.http.protocol.HttpContext; import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServerException; -import org.apache.solr.client.solrj.impl.HttpSolrServer; import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; -import net.yacy.search.index.SolrField; +public class AbstractSolrConnector implements SolrConnector { -public class SolrSingleConnector implements SolrConnector { + protected SolrServer server; - private final String solrurl, host, solrpath, solraccount, solrpw; - private final int port; - private HttpSolrServer server; + protected AbstractSolrConnector() { + this.server = null; + } - /** - * create a new solr connector - * @param url the solr url, like http://192.168.1.60:8983/solr/ or http://admin:pw@192.168.1.60:8983/solr/ - * @param scheme - * @throws IOException - */ - public SolrSingleConnector(final String url) throws IOException { - this.solrurl = url; + protected void init(SolrServer server) { + this.server = server; + } - // connect using authentication - final MultiProtocolURI u = new MultiProtocolURI(this.solrurl); - this.host = u.getHost(); - this.port = u.getPort(); - this.solrpath = u.getPath(); - final String userinfo = u.getUserInfo(); - if (userinfo == null || userinfo.length() == 0) { - this.solraccount = ""; this.solrpw = ""; - } else { - final int p = userinfo.indexOf(':'); - if (p < 0) { - this.solraccount = userinfo; this.solrpw = ""; - } else { - this.solraccount = userinfo.substring(0, p); this.solrpw = userinfo.substring(p + 1); - } - } - if 
(this.solraccount.length() > 0) { - final DefaultHttpClient client = new DefaultHttpClient() { - @Override - protected HttpContext createHttpContext() { - HttpContext context = super.createHttpContext(); - AuthCache authCache = new BasicAuthCache(); - BasicScheme basicAuth = new BasicScheme(); - HttpHost targetHost = new HttpHost(u.getHost(), u.getPort(), u.getProtocol()); - authCache.put(targetHost, basicAuth); - context.setAttribute(ClientContext.AUTH_CACHE, authCache); - return context; - } - }; - BasicCredentialsProvider credsProvider = new BasicCredentialsProvider(); - credsProvider.setCredentials(new AuthScope(this.host, AuthScope.ANY_PORT), new UsernamePasswordCredentials(this.solraccount, this.solrpw)); - client.setCredentialsProvider(credsProvider); - this.server = new HttpSolrServer("http://" + this.host + ":" + this.port + this.solrpath, client); - } else { - this.server = new HttpSolrServer(this.solrurl); - } - this.server.setAllowCompression(true); - this.server.setConnectionTimeout(60000); - this.server.setMaxRetries(1); // Solr-Doc: No more than 1 recommended (depreciated) - this.server.setSoTimeout(60000); + public SolrServer getServer() { + return this.server; } @Override @@ -118,9 +59,9 @@ public class SolrSingleConnector implements SolrConnector { try { this.server.commit(); } catch (SolrServerException e) { - e.printStackTrace(); + Log.logException(e); } catch (IOException e) { - e.printStackTrace(); + Log.logException(e); } } @@ -251,37 +192,4 @@ public class SolrSingleConnector implements SolrConnector { //return result; } - - public String getAdminInterface() { - final InetAddress localhostExternAddress = Domains.myPublicLocalIP(); - final String localhostExtern = localhostExternAddress == null ? 
"127.0.0.1" : localhostExternAddress.getHostAddress(); - String u = this.solrurl; - int p = u.indexOf("localhost",0); if (p < 0) p = u.indexOf("127.0.0.1",0); - if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + 9); - return u + (u.endsWith("/") ? "admin/" : "/admin/"); - } - - public static void main(final String args[]) { - SolrSingleConnector solr; - try { - solr = new SolrSingleConnector("http://127.0.0.1:8983/solr"); - solr.clear(); - final File exampleDir = new File("test/parsertest/"); - long t, t0, a = 0; - int c = 0; - System.out.println("push files in " + exampleDir.getAbsolutePath() + " to Solr"); - for (final String s: exampleDir.list()) { - if (s.startsWith(".")) continue; - t = System.currentTimeMillis(); - solr.add(new File(exampleDir, s), s); - t0 = (System.currentTimeMillis() - t); - a += t0; - c++; - System.out.println("pushed file " + s + " to solr, " + t0 + " milliseconds"); - } - System.out.println("pushed " + c + " files in " + a + " milliseconds, " + (a / c) + " milliseconds average; " + (60000 / a * c) + " PPM"); - } catch (final IOException e) { - e.printStackTrace(); - } - } } diff --git a/source/net/yacy/cora/services/federated/solr/SolrMultipleConnector.java b/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java similarity index 91% rename from source/net/yacy/cora/services/federated/solr/SolrMultipleConnector.java rename to source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java index d9b5147a3..92f7963a8 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrMultipleConnector.java +++ b/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java @@ -8,7 +8,7 @@ import java.util.concurrent.ArrayBlockingQueue; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; -public class SolrMultipleConnector implements SolrConnector { +public class MultipleSolrConnector implements SolrConnector { private final static SolrDoc POISON_DOC = 
new SolrDoc(); @@ -16,8 +16,8 @@ public class SolrMultipleConnector implements SolrConnector { private final AddWorker[] worker; private final SolrConnector solr; - public SolrMultipleConnector(final String url, int connections) throws IOException { - this.solr = new SolrSingleConnector(url); + public MultipleSolrConnector(final String url, int connections) throws IOException { + this.solr = new SingleSolrConnector(url); this.queue = new ArrayBlockingQueue(1000); this.worker = new AddWorker[connections]; for (int i = 0; i < connections; i++) { @@ -29,13 +29,13 @@ public class SolrMultipleConnector implements SolrConnector { private class AddWorker extends Thread { private final SolrConnector solr; public AddWorker(final String url) throws IOException { - this.solr = new SolrSingleConnector(url); + this.solr = new SingleSolrConnector(url); } @Override public void run() { SolrDoc doc; try { - while ((doc = SolrMultipleConnector.this.queue.take()) != POISON_DOC) { + while ((doc = MultipleSolrConnector.this.queue.take()) != POISON_DOC) { try { this.solr.add(doc); } catch (SolrException e) { diff --git a/source/net/yacy/cora/services/federated/solr/SolrRetryConnector.java b/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java similarity index 97% rename from source/net/yacy/cora/services/federated/solr/SolrRetryConnector.java rename to source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java index b28134952..74787b94b 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrRetryConnector.java +++ b/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java @@ -31,12 +31,12 @@ import java.util.List; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; -public class SolrRetryConnector implements SolrConnector { +public class RetrySolrConnector implements SolrConnector { private final SolrConnector solrConnector; private final long retryMaxTime; - public SolrRetryConnector(final 
SolrConnector solrConnector, final long retryMaxTime) { + public RetrySolrConnector(final SolrConnector solrConnector, final long retryMaxTime) { this.solrConnector = solrConnector; this.retryMaxTime = retryMaxTime; } diff --git a/source/net/yacy/cora/services/federated/solr/SolrShardingSelection.java b/source/net/yacy/cora/services/federated/solr/ShardSelection.java similarity index 96% rename from source/net/yacy/cora/services/federated/solr/SolrShardingSelection.java rename to source/net/yacy/cora/services/federated/solr/ShardSelection.java index ae86a3411..c303a56db 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrShardingSelection.java +++ b/source/net/yacy/cora/services/federated/solr/ShardSelection.java @@ -33,7 +33,7 @@ import java.security.NoSuchAlgorithmException; import java.util.concurrent.atomic.AtomicLong; import net.yacy.search.index.SolrField; -public class SolrShardingSelection { +public class ShardSelection { public final static Charset charsetUTF8; static { @@ -47,7 +47,7 @@ public class SolrShardingSelection { MODULO_HOST_MD5, ROUND_ROBIN; } - public SolrShardingSelection(final Method method, final int dimension) { + public ShardSelection(final Method method, final int dimension) { this.method = method; this.dimension = dimension; this.chardID = new AtomicLong(0); diff --git a/source/net/yacy/cora/services/federated/solr/SolrShardingConnector.java b/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java similarity index 91% rename from source/net/yacy/cora/services/federated/solr/SolrShardingConnector.java rename to source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java index 0249165f2..ed8de4cdf 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrShardingConnector.java +++ b/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java @@ -37,22 +37,22 @@ import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; -public class SolrShardingConnector 
implements SolrConnector { +public class ShardSolrConnector implements SolrConnector { private final List connectors; - private final SolrShardingSelection sharding; + private final ShardSelection sharding; private final String[] urls; - public SolrShardingConnector(final String urlList, final SolrShardingSelection.Method method, final long timeout, boolean multipleConnections) throws IOException { + public ShardSolrConnector(final String urlList, final ShardSelection.Method method, final long timeout, boolean multipleConnections) throws IOException { urlList.replace(' ', ','); this.urls = urlList.split(","); this.connectors = new ArrayList(); SolrConnector s; for (final String u: this.urls) { - s = multipleConnections ? new SolrMultipleConnector(u.trim(), 2) : new SolrSingleConnector(u.trim()); - this.connectors.add(new SolrRetryConnector(s, timeout)); + s = multipleConnections ? new MultipleSolrConnector(u.trim(), 2) : new SingleSolrConnector(u.trim()); + this.connectors.add(new RetrySolrConnector(s, timeout)); } - this.sharding = new SolrShardingSelection(method, this.urls.length); + this.sharding = new ShardSelection(method, this.urls.length); } @Override diff --git a/source/net/yacy/cora/services/federated/solr/SingleSolrConnector.java b/source/net/yacy/cora/services/federated/solr/SingleSolrConnector.java new file mode 100644 index 000000000..0e97600d9 --- /dev/null +++ b/source/net/yacy/cora/services/federated/solr/SingleSolrConnector.java @@ -0,0 +1,138 @@ +/** + * SolrSingleConnector + * Copyright 2011 by Michael Peter Christen + * First released 14.04.2011 at http://yacy.net + * + * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $ + * $LastChangedRevision: 7654 $ + * $LastChangedBy: orbiter $ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) 
any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.services.federated.solr; + +import java.io.File; +import java.io.IOException; +import java.net.InetAddress; + +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.protocol.Domains; + +import org.apache.http.HttpHost; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.AuthCache; +import org.apache.http.client.protocol.ClientContext; +import org.apache.http.impl.auth.BasicScheme; +import org.apache.http.impl.client.BasicAuthCache; +import org.apache.http.impl.client.BasicCredentialsProvider; +import org.apache.http.impl.client.DefaultHttpClient; +import org.apache.http.protocol.HttpContext; +import org.apache.solr.client.solrj.impl.HttpSolrServer; + + +public class SingleSolrConnector extends AbstractSolrConnector implements SolrConnector { + + private final String solrurl, host, solrpath, solraccount, solrpw; + private final int port; + + /** + * create a new solr connector + * @param url the solr url, like http://192.168.1.60:8983/solr/ or http://admin:pw@192.168.1.60:8983/solr/ + * @param scheme + * @throws IOException + */ + public SingleSolrConnector(final String url) throws IOException { + super(); + this.solrurl = url; + + // connect using authentication + final MultiProtocolURI u = new MultiProtocolURI(this.solrurl); + this.host = u.getHost(); + this.port = u.getPort(); + this.solrpath = u.getPath(); + final String userinfo = u.getUserInfo(); + if (userinfo == null || userinfo.length() == 0) { + this.solraccount = 
""; this.solrpw = ""; + } else { + final int p = userinfo.indexOf(':'); + if (p < 0) { + this.solraccount = userinfo; this.solrpw = ""; + } else { + this.solraccount = userinfo.substring(0, p); this.solrpw = userinfo.substring(p + 1); + } + } + HttpSolrServer s; + if (this.solraccount.length() > 0) { + final DefaultHttpClient client = new DefaultHttpClient() { + @Override + protected HttpContext createHttpContext() { + HttpContext context = super.createHttpContext(); + AuthCache authCache = new BasicAuthCache(); + BasicScheme basicAuth = new BasicScheme(); + HttpHost targetHost = new HttpHost(u.getHost(), u.getPort(), u.getProtocol()); + authCache.put(targetHost, basicAuth); + context.setAttribute(ClientContext.AUTH_CACHE, authCache); + return context; + } + }; + BasicCredentialsProvider credsProvider = new BasicCredentialsProvider(); + credsProvider.setCredentials(new AuthScope(this.host, AuthScope.ANY_PORT), new UsernamePasswordCredentials(this.solraccount, this.solrpw)); + client.setCredentialsProvider(credsProvider); + s = new HttpSolrServer("http://" + this.host + ":" + this.port + this.solrpath, client); + } else { + s = new HttpSolrServer(this.solrurl); + } + s.setAllowCompression(true); + s.setConnectionTimeout(60000); + s.setMaxRetries(1); // Solr-Doc: No more than 1 recommended (depreciated) + s.setSoTimeout(60000); + super.init(s); + } + + public String getAdminInterface() { + final InetAddress localhostExternAddress = Domains.myPublicLocalIP(); + final String localhostExtern = localhostExternAddress == null ? "127.0.0.1" : localhostExternAddress.getHostAddress(); + String u = this.solrurl; + int p = u.indexOf("localhost",0); if (p < 0) p = u.indexOf("127.0.0.1",0); + if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + 9); + return u + (u.endsWith("/") ? 
"admin/" : "/admin/"); + } + + public static void main(final String args[]) { + SingleSolrConnector solr; + try { + solr = new SingleSolrConnector("http://127.0.0.1:8983/solr"); + solr.clear(); + final File exampleDir = new File("test/parsertest/"); + long t, t0, a = 0; + int c = 0; + System.out.println("push files in " + exampleDir.getAbsolutePath() + " to Solr"); + for (final String s: exampleDir.list()) { + if (s.startsWith(".")) continue; + t = System.currentTimeMillis(); + solr.add(new File(exampleDir, s), s); + t0 = (System.currentTimeMillis() - t); + a += t0; + c++; + System.out.println("pushed file " + s + " to solr, " + t0 + " milliseconds"); + } + System.out.println("pushed " + c + " files in " + a + " milliseconds, " + (a / c) + " milliseconds average; " + (60000 / a * c) + " PPM"); + } catch (final IOException e) { + e.printStackTrace(); + } + } +} diff --git a/source/net/yacy/cora/services/federated/solr/SolrScheme.java b/source/net/yacy/cora/services/federated/solr/SolrScheme.java deleted file mode 100644 index f58a910d2..000000000 --- a/source/net/yacy/cora/services/federated/solr/SolrScheme.java +++ /dev/null @@ -1,32 +0,0 @@ -/** - * SolrScheme - * Copyright 2011 by Michael Peter Christen - * First released 09.05.2012 at http://yacy.net - * - * $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $ - * $LastChangedRevision: 7654 $ - * $LastChangedBy: orbiter $ - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this program in the file lgpl21.txt - * If not, see . - */ - -package net.yacy.cora.services.federated.solr; - - -public interface SolrScheme { - - public SolrDoc toSolr(); - -} diff --git a/source/net/yacy/cora/storage/ConfigurationSet.java b/source/net/yacy/cora/storage/ConfigurationSet.java index 3277b6328..2e0ac8ef4 100644 --- a/source/net/yacy/cora/storage/ConfigurationSet.java +++ b/source/net/yacy/cora/storage/ConfigurationSet.java @@ -38,7 +38,6 @@ import java.util.logging.Level; import java.util.logging.Logger; import net.yacy.cora.storage.ConfigurationSet.Entry; -import net.yacy.kelondro.util.FileUtils; import net.yacy.search.index.SolrField; /** * this class reads configuration attributes as a list of keywords from a list @@ -199,7 +198,7 @@ public class ConfigurationSet extends TreeMap implements Serializa if (this.file == null) return; // create a temporary bak file, use it as template to preserve user comments File bakfile = new File (this.file.getAbsolutePath() + ".bak"); - FileUtils.copy (this.file, bakfile); + Files.copy (this.file, bakfile); @SuppressWarnings("unchecked") TreeMap tclone = (TreeMap) this.clone(); // clone to write appended entries diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaTripleImpl.java b/source/net/yacy/document/parser/rdfa/impl/RDFaTripleImpl.java index 35dc378c5..b1a125f4b 100644 --- a/source/net/yacy/document/parser/rdfa/impl/RDFaTripleImpl.java +++ b/source/net/yacy/document/parser/rdfa/impl/RDFaTripleImpl.java @@ -14,13 +14,12 @@ import javax.xml.transform.TransformerFactory; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; +import net.yacy.yacy; import net.yacy.document.parser.rdfa.IRDFaTriple; import net.yacy.kelondro.logging.Log; -import net.yacy.yacy; - public class RDFaTripleImpl{ - + private static Templates templates = null; private String propertyURI = null; 
private String subjectURI = null; @@ -30,59 +29,52 @@ public class RDFaTripleImpl{ private String value = null; private String dataType = null; private String language = null; - private Reader in; - private Transformer aTransformer; - private ArrayList allRDFaTriples = new ArrayList(); + private final Reader in; + private final Transformer aTransformer; + private final ArrayList allRDFaTriples = new ArrayList(); public RDFaTripleImpl(Reader in, String base) throws IOException, TransformerException, TransformerConfigurationException { - + BufferedReader bufReader = new BufferedReader(in); String readLine = bufReader.readLine(); if (!readLine.toLowerCase().contains(" to.lastModified()) try { - FileUtils.copy(from, to); + Files.copy(from, to); } catch (final IOException e) {} } } @@ -118,7 +120,7 @@ public class migration { }else{ try { mkdirs(styleFile.getParentFile()); - FileUtils.copy(skinFile, styleFile); + Files.copy(skinFile, styleFile); Log.logInfo("MIGRATION", "copied new Skinfile"); } catch (final IOException e) { Log.logSevere("MIGRATION", "Cannot copy skinfile."); @@ -164,7 +166,7 @@ public class migration { sb.wikiDB.close(); file2 = new File(sb.workPath, "wiki.db"); try { - FileUtils.copy(file, file2); + Files.copy(file, file2); file.delete(); } catch (final IOException e) { } @@ -174,7 +176,7 @@ public class migration { Log.logInfo("MIGRATION", "Migrating wiki-bkp.db to "+ sb.workPath); file2 = new File(sb.workPath, "wiki-bkp.db"); try { - FileUtils.copy(file, file2); + Files.copy(file, file2); file.delete(); } catch (final IOException e) {} } @@ -192,7 +194,7 @@ public class migration { sb.messageDB.close(); file2=new File(sb.workPath, "message.db"); try { - FileUtils.copy(file, file2); + Files.copy(file, file2); file.delete(); } catch (final IOException e) {} try { diff --git a/source/net/yacy/peers/Network.java b/source/net/yacy/peers/Network.java index fb99b599b..27e81ff30 100644 --- a/source/net/yacy/peers/Network.java +++ 
b/source/net/yacy/peers/Network.java @@ -351,6 +351,7 @@ public class Network } } } catch ( final Exception e ) { + Log.logException(e); log.logSevere( "publishThread: error with target seed " + this.seed.toString() + ": " + e.getMessage(), e); diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java index eb69c65e1..ccb75449a 100644 --- a/source/net/yacy/peers/operation/yacyRelease.java +++ b/source/net/yacy/peers/operation/yacyRelease.java @@ -52,6 +52,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.services.federated.yacy.CacheStrategy; +import net.yacy.cora.storage.Files; import net.yacy.document.Document; import net.yacy.document.parser.tarParser; import net.yacy.kelondro.data.meta.DigestURI; @@ -495,7 +496,7 @@ public final class yacyRelease extends yacyVersion { final File InfoPlistSource = new File(sb.getDataPath(), "DATA/RELEASE/yacy/addon/YaCy.app/Contents/Info.plist"); final File InfoPlistDestination = new File(sb.getAppPath(), "addon/YaCy.app/Contents/Info.plist"); if (InfoPlistSource.exists() && InfoPlistDestination.exists()) { - FileUtils.copy(InfoPlistSource, InfoPlistDestination); + Files.copy(InfoPlistSource, InfoPlistDestination); Log.logInfo("UPDATE", "replaced Info.plist"); } } diff --git a/source/net/yacy/peers/operation/yacySeedUploadFile.java b/source/net/yacy/peers/operation/yacySeedUploadFile.java index c7e35d1ca..930ec81d9 100644 --- a/source/net/yacy/peers/operation/yacySeedUploadFile.java +++ b/source/net/yacy/peers/operation/yacySeedUploadFile.java @@ -1,4 +1,4 @@ -//yacySeedUploadFile.java +//yacySeedUploadFile.java //------------------------------------- //part of YACY //(C) by Michael Peter Christen; mc@yacy.net @@ -27,30 +27,32 @@ package net.yacy.peers.operation; import java.io.File; -import net.yacy.kelondro.util.FileUtils; +import com.google.common.io.Files; 
import de.anomic.server.serverSwitch; public class yacySeedUploadFile implements yacySeedUploader { - + public static final String CONFIG_FILE_PATH = "seedFilePath"; + @Override public String uploadSeedFile(final serverSwitch sb, final File seedFile) throws Exception { - + String seedFilePath = ""; try { seedFilePath = sb.getConfig(CONFIG_FILE_PATH,""); if (seedFilePath.length() == 0) throw new Exception("Path to seed file is not configured properly"); - - final File publicSeedFile = new File(seedFilePath); - FileUtils.copy(seedFile,publicSeedFile); - + + final File publicSeedFile = new File(seedFilePath); + Files.copy(seedFile,publicSeedFile); + return "Seed-List file stored successfully"; } catch (final Exception e) { throw new Exception("Unable to store the seed-list file into the filesystem using path '" + seedFilePath + "'. " + e.getMessage()); } } + @Override public String[] getConfigurationOptions() { return new String[]{CONFIG_FILE_PATH}; } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 4fd8657cb..6d1045419 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -93,9 +93,9 @@ import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.TimeoutRequest; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.protocol.http.ProxySettings; +import net.yacy.cora.services.federated.solr.ShardSelection; +import net.yacy.cora.services.federated.solr.ShardSolrConnector; import net.yacy.cora.services.federated.solr.SolrDoc; -import net.yacy.cora.services.federated.solr.SolrShardingConnector; -import net.yacy.cora.services.federated.solr.SolrShardingSelection; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Condenser; import net.yacy.document.Document; @@ -151,6 +151,9 @@ import net.yacy.search.query.SearchEvent; import net.yacy.search.query.SearchEventCache; import net.yacy.search.ranking.BlockRank; import 
net.yacy.search.ranking.RankingProfile; + +import com.google.common.io.Files; + import de.anomic.crawler.Cache; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlQueues; @@ -392,7 +395,7 @@ public final class Switchboard extends serverSwitch getConfig("federated.service.solr.indexing.schemefile", "solr.keys.default.list"); final File solrWorkProfile = new File(getDataPath(), "DATA/SETTINGS/" + schemename); if ( !solrWorkProfile.exists() ) { - FileUtils.copy(solrBackupProfile, solrWorkProfile); + Files.copy(solrBackupProfile, solrWorkProfile); } final SolrConfiguration backupScheme = new SolrConfiguration(solrBackupProfile); this.solrScheme = new SolrConfiguration(solrWorkProfile); @@ -407,9 +410,9 @@ public final class Switchboard extends serverSwitch try { this.indexSegments.segment(Segments.Process.LOCALCRAWLING).connectSolr( - (usesolr) ? new SolrShardingConnector( + (usesolr) ? new ShardSolrConnector( solrurls, - SolrShardingSelection.Method.MODULO_HOST_MD5, + ShardSelection.Method.MODULO_HOST_MD5, 10000, true) : null); } catch ( final IOException e ) { Log.logException(e); @@ -731,7 +734,7 @@ public final class Switchboard extends serverSwitch getDataPath(SwitchboardConstants.HTDOCS_PATH, SwitchboardConstants.HTDOCS_PATH_DEFAULT), "notifier.gif"); try { - FileUtils.copy(notifierSource, notifierDest); + Files.copy(notifierSource, notifierDest); } catch ( final IOException e ) { } diff --git a/source/net/yacy/search/solr/EmbeddedSolrConnector.java b/source/net/yacy/search/solr/EmbeddedSolrConnector.java new file mode 100644 index 000000000..496f91574 --- /dev/null +++ b/source/net/yacy/search/solr/EmbeddedSolrConnector.java @@ -0,0 +1,120 @@ +/** + * EmbeddedSolrConnector + * Copyright 2012 by Michael Peter Christen + * First released 21.06.2012 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software 
Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +package net.yacy.search.solr; + +import java.io.File; +import java.io.IOException; + +import javax.xml.parsers.ParserConfigurationException; + +import net.yacy.cora.services.federated.solr.AbstractSolrConnector; +import net.yacy.cora.services.federated.solr.SolrConnector; +import net.yacy.cora.services.federated.solr.SolrDoc; +import net.yacy.kelondro.logging.Log; +import net.yacy.search.index.SolrField; + +import org.apache.solr.client.solrj.SolrServer; +import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; +import org.apache.solr.client.solrj.embedded.JettySolrRunner; +import org.apache.solr.client.solrj.impl.HttpSolrServer; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.core.CoreContainer; +import org.xml.sax.SAXException; + +import com.google.common.io.Files; + +public class EmbeddedSolrConnector extends AbstractSolrConnector implements SolrConnector { + + private final CoreContainer core; + private final static String[] confFiles = {"solrconfig.xml", "schema.xml", "stopwords.txt", "synonyms.txt", "protwords.txt", "currency.xml", "elevate.xml", "lang/"}; + //private final static String[] confFiles = {"solrconfig.xml", "schema.xml", "stopwords.txt", "synonyms.txt", "protwords.txt", "currency.xml", "elevate.xml", "lang/"}; + + public EmbeddedSolrConnector(File storagePath, File solr_config) throws IOException { + super(); + // copy the solrconfig.xml to the storage path + File conf 
= new File(storagePath, "conf"); + conf.mkdirs(); + File source, target; + for (String cf: confFiles) { + source = new File(solr_config, cf); + if (source.isDirectory()) { + target = new File(conf, cf); + target.mkdirs(); + for (String cfl: source.list()) { + Files.copy(new File(source, cfl), new File(target, cfl)); + } + } else { + target = new File(conf, cf); + target.getParentFile().mkdirs(); + Files.copy(source, target); + } + } + try { + this.core = new CoreContainer(storagePath.getAbsolutePath(), new File(solr_config, "solr.xml")); + } catch (ParserConfigurationException e) { + throw new IOException(e.getMessage(), e); + } catch (SAXException e) { + throw new IOException(e.getMessage(), e); + } + super.init(new EmbeddedSolrServer(this.core, "collection1")); + } + + @Override + public void close() { + super.close(); + this.core.shutdown(); + } + + public static void main(String[] args) { + File solr_config = new File("defaults/solr"); + File storage = new File("DATA/INDEX/webportal/SEGMENTS/text/solr/"); + storage.mkdirs(); + try { + EmbeddedSolrConnector solr = new EmbeddedSolrConnector(storage, solr_config); + SolrDoc solrdoc = new SolrDoc(); + solrdoc.addSolr(SolrField.id, "ABCD0000abcd"); + solrdoc.addSolr(SolrField.title, "Lorem ipsum"); + solrdoc.addSolr(SolrField.text_t, "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."); + solr.add(solrdoc); + SolrDocumentList searchresult = solr.get(SolrField.text_t.name() + ":tempor", 0, 10); + for (SolrDocument d: searchresult) { + System.out.println(d.toString()); + } + solr.close(); + /* + JettySolrRunner solrJetty = new JettySolrRunner("/solr", 8091, storage.getAbsolutePath()); + try { + solrJetty.start(); + String url = "http://localhost:" + solrJetty.getLocalPort() + "/solr"; + SolrServer server = new HttpSolrServer(url); + } catch (Exception e) { + e.printStackTrace(); + } + */ + } catch (IOException e) { + Log.logException(e); + } + 
+ } + +} diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java index 2d0f92b9b..ac62f5674 100644 --- a/source/net/yacy/yacy.java +++ b/source/net/yacy/yacy.java @@ -80,6 +80,9 @@ import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.MetadataRepository; import net.yacy.search.index.Segment; + +import com.google.common.io.Files; + import de.anomic.data.Translator; import de.anomic.http.server.HTTPDemon; import de.anomic.server.serverCore; @@ -181,7 +184,7 @@ public final class yacy { f = new File(dataHome, "DATA/LOG/yacy.logging"); final File f0 = new File(appHome, "defaults/yacy.logging"); if (!f.exists() || f0.lastModified() > f.lastModified()) try { - FileUtils.copy(f0, f); + Files.copy(f0, f); } catch (final IOException e){ System.out.println("could not copy yacy.logging"); } @@ -268,7 +271,7 @@ public final class yacy { // create default notifier picture //TODO: Use templates instead of copying images ... if (!((new File(htDocsPath, "notifier.gif")).exists())) try { - FileUtils.copy(new File(htRootPath, "env/grafics/empty.gif"), + Files.copy(new File(htRootPath, "env/grafics/empty.gif"), new File(htDocsPath, "notifier.gif")); } catch (final IOException e) {}