Change: #532 ElasticSearch設定の外出し (#650)

* Change: #532 ElasticSearch設定の外出し

* バージョンチェック

* 起動時にエラー
This commit is contained in:
KMY(雪あすか) 2024-03-12 12:11:13 +09:00 committed by GitHub
parent 8e7c66522e
commit a8fbcb3fb6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 508 additions and 269 deletions

3
.gitignore vendored
View file

@ -28,6 +28,9 @@
/node_modules/ /node_modules/
/build/ /build/
# Ignore elasticsearch config
/.elasticsearch.yml
# Ignore Vagrant files # Ignore Vagrant files
.vagrant/ .vagrant/

View file

@ -3,83 +3,9 @@
class AccountsIndex < Chewy::Index class AccountsIndex < Chewy::Index
include DatetimeClampingConcern include DatetimeClampingConcern
settings index: index_preset(refresh_interval: '30s'), analysis: { # ElasticSearch config is moved to "/config/elasticsearch.default.yml".
filter: { # Edit it when original Mastodon changed ElasticSearch config.
english_stop: { settings index: index_preset(refresh_interval: '30s'), analysis: ChewyConfig.instance.accounts
type: 'stop',
stopwords: '_english_',
},
english_stemmer: {
type: 'stemmer',
language: 'english',
},
english_possessive_stemmer: {
type: 'stemmer',
language: 'possessive_english',
},
my_posfilter: {
type: 'sudachi_part_of_speech',
stoptags: [
'助詞',
'助動詞',
'補助記号,句点',
'補助記号,読点',
],
},
},
analyzer: {
natural: {
tokenizer: 'standard',
filter: %w(
lowercase
asciifolding
cjk_width
elision
english_possessive_stemmer
english_stop
english_stemmer
),
},
sudachi_analyzer: {
filter: %w(
my_posfilter
sudachi_normalizedform
),
type: 'custom',
tokenizer: 'sudachi_tokenizer',
},
verbatim: {
tokenizer: 'standard',
filter: %w(lowercase asciifolding cjk_width),
},
edge_ngram: {
tokenizer: 'edge_ngram',
filter: %w(lowercase asciifolding cjk_width),
},
},
tokenizer: {
edge_ngram: {
type: 'edge_ngram',
min_gram: 1,
max_gram: 15,
},
sudachi_tokenizer: {
resources_path: '/etc/elasticsearch/sudachi',
split_mode: 'A',
type: 'sudachi_tokenizer',
discard_punctuation: 'true',
},
},
}
index_scope ::Account.searchable.includes(:account_stat) index_scope ::Account.searchable.includes(:account_stat)
@ -90,8 +16,15 @@ class AccountsIndex < Chewy::Index
field(:properties, type: 'keyword', value: ->(account) { account.searchable_properties }) field(:properties, type: 'keyword', value: ->(account) { account.searchable_properties })
field(:last_status_at, type: 'date', value: ->(account) { clamp_date(account.last_status_at || account.created_at) }) field(:last_status_at, type: 'date', value: ->(account) { clamp_date(account.last_status_at || account.created_at) })
field(:domain, type: 'keyword', value: ->(account) { account.domain || '' }) field(:domain, type: 'keyword', value: ->(account) { account.domain || '' })
field(:display_name, type: 'text', analyzer: 'verbatim') { field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'verbatim' } field(:display_name, type: 'text', analyzer: ChewyConfig.instance.accounts_analyzers.dig('display_name', 'analyzer')) do
field(:username, type: 'text', analyzer: 'verbatim', value: ->(account) { [account.username, account.domain].compact.join('@') }) { field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'verbatim' } field :edge_ngram, type: 'text', analyzer: ChewyConfig.instance.accounts_analyzers.dig('display_name', 'edge_ngram', 'analyzer'), search_analyzer: ChewyConfig.instance.accounts_analyzers.dig('display_name', 'edge_ngram', 'search_analyzer')
field(:text, type: 'text', analyzer: 'sudachi_analyzer', value: ->(account) { account.searchable_text }) { field(:stemmed, type: 'text', analyzer: 'natural') } end
field(:username, type: 'text', analyzer: ChewyConfig.instance.accounts_analyzers.dig('username', 'analyzer'), value: lambda { |account|
[account.username, account.domain].compact.join('@')
}) do
field :edge_ngram, type: 'text', analyzer: ChewyConfig.instance.accounts_analyzers.dig('username', 'edge_ngram', 'analyzer'),
search_analyzer: ChewyConfig.instance.accounts_analyzers.dig('username', 'edge_ngram', 'search_analyzer')
end
field(:text, type: 'text', analyzer: ChewyConfig.instance.accounts_analyzers.dig('text', 'analyzer'), value: ->(account) { account.searchable_text }) { field(:stemmed, type: 'text', analyzer: ChewyConfig.instance.accounts_analyzers.dig('text', 'stemmed', 'analyzer')) }
end end
end end

View file

@ -3,81 +3,9 @@
class PublicStatusesIndex < Chewy::Index class PublicStatusesIndex < Chewy::Index
include DatetimeClampingConcern include DatetimeClampingConcern
settings index: index_preset(refresh_interval: '30s', number_of_shards: 5), analysis: { # ElasticSearch config is moved to "/config/elasticsearch.default.yml".
filter: { # Edit it when original Mastodon changed ElasticSearch config.
english_stop: { settings index: index_preset(refresh_interval: '30s', number_of_shards: 5), analysis: ChewyConfig.instance.public_statuses
type: 'stop',
stopwords: '_english_',
},
english_stemmer: {
type: 'stemmer',
language: 'english',
},
english_possessive_stemmer: {
type: 'stemmer',
language: 'possessive_english',
},
my_posfilter: {
type: 'sudachi_part_of_speech',
stoptags: [
'助詞',
'助動詞',
'補助記号,句点',
'補助記号,読点',
],
},
},
analyzer: {
content: {
tokenizer: 'uax_url_email',
filter: %w(
english_possessive_stemmer
lowercase
asciifolding
cjk_width
english_stop
english_stemmer
),
},
hashtag: {
tokenizer: 'keyword',
filter: %w(
word_delimiter_graph
lowercase
asciifolding
cjk_width
),
},
sudachi_analyzer: {
tokenizer: 'sudachi_tokenizer',
type: 'custom',
filter: %w(
english_possessive_stemmer
lowercase
asciifolding
cjk_width
english_stop
english_stemmer
my_posfilter
sudachi_normalizedform
),
},
},
tokenizer: {
sudachi_tokenizer: {
resources_path: '/etc/elasticsearch/sudachi',
split_mode: 'A',
type: 'sudachi_tokenizer',
discard_punctuation: 'true',
},
},
}
index_scope ::Status.unscoped index_scope ::Status.unscoped
.kept .kept
@ -87,8 +15,8 @@ class PublicStatusesIndex < Chewy::Index
root date_detection: false do root date_detection: false do
field(:id, type: 'long') field(:id, type: 'long')
field(:account_id, type: 'long') field(:account_id, type: 'long')
field(:text, type: 'text', analyzer: 'sudachi_analyzer', value: ->(status) { status.searchable_text }) { field(:stemmed, type: 'text', analyzer: 'content') } field(:text, type: 'text', analyzer: ChewyConfig.instance.public_statuses_analyzers.dig('text', 'analyzer'), value: ->(status) { status.searchable_text }) { field(:stemmed, type: 'text', analyzer: ChewyConfig.instance.public_statuses_analyzers.dig('text', 'stemmed', 'analyzer')) }
field(:tags, type: 'text', analyzer: 'hashtag', value: ->(status) { status.tags.map(&:display_name) }) field(:tags, type: 'text', analyzer: ChewyConfig.instance.public_statuses_analyzers.dig('tags', 'analyzer'), value: ->(status) { status.tags.map(&:display_name) })
field(:language, type: 'keyword') field(:language, type: 'keyword')
field(:domain, type: 'keyword', value: ->(status) { status.account.domain || '' }) field(:domain, type: 'keyword', value: ->(status) { status.account.domain || '' })
field(:properties, type: 'keyword', value: ->(status) { status.searchable_properties }) field(:properties, type: 'keyword', value: ->(status) { status.searchable_properties })

View file

@ -3,85 +3,9 @@
class StatusesIndex < Chewy::Index class StatusesIndex < Chewy::Index
include DatetimeClampingConcern include DatetimeClampingConcern
settings index: index_preset(refresh_interval: '30s', number_of_shards: 5), analysis: { # ElasticSearch config is moved to "/config/elasticsearch.default.yml".
filter: { # Edit it when original Mastodon changed ElasticSearch config.
english_stop: { settings index: index_preset(refresh_interval: '30s', number_of_shards: 5), analysis: ChewyConfig.instance.statuses
type: 'stop',
stopwords: '_english_',
},
english_stemmer: {
type: 'stemmer',
language: 'english',
},
english_possessive_stemmer: {
type: 'stemmer',
language: 'possessive_english',
},
my_posfilter: {
type: 'sudachi_part_of_speech',
stoptags: [
'助詞',
'助動詞',
'補助記号,句点',
'補助記号,読点',
],
},
},
analyzer: {
verbatim: {
tokenizer: 'uax_url_email',
filter: %w(lowercase),
},
content: {
tokenizer: 'uax_url_email',
filter: %w(
english_possessive_stemmer
lowercase
asciifolding
cjk_width
english_stop
english_stemmer
),
},
hashtag: {
tokenizer: 'keyword',
filter: %w(
word_delimiter_graph
lowercase
asciifolding
cjk_width
),
},
sudachi_analyzer: {
tokenizer: 'sudachi_tokenizer',
type: 'custom',
filter: %w(
english_possessive_stemmer
lowercase
asciifolding
cjk_width
english_stop
english_stemmer
my_posfilter
sudachi_normalizedform
),
},
},
tokenizer: {
sudachi_tokenizer: {
resources_path: '/etc/elasticsearch/sudachi',
split_mode: 'A',
type: 'sudachi_tokenizer',
discard_punctuation: 'true',
},
},
}
index_scope ::Status.unscoped.kept.without_reblogs.includes( index_scope ::Status.unscoped.kept.without_reblogs.includes(
:account, :account,
@ -107,8 +31,8 @@ class StatusesIndex < Chewy::Index
root date_detection: false do root date_detection: false do
field(:id, type: 'long') field(:id, type: 'long')
field(:account_id, type: 'long') field(:account_id, type: 'long')
field(:text, type: 'text', analyzer: 'sudachi_analyzer', value: ->(status) { status.searchable_text }) { field(:stemmed, type: 'text', analyzer: 'content') } field(:text, type: 'text', analyzer: ChewyConfig.instance.statuses_analyzers.dig('text', 'analyzer'), value: ->(status) { status.searchable_text }) { field(:stemmed, type: 'text', analyzer: ChewyConfig.instance.statuses_analyzers.dig('text', 'stemmed', 'analyzer')) }
field(:tags, type: 'text', analyzer: 'hashtag', value: ->(status) { status.tags.map(&:display_name) }) field(:tags, type: 'text', analyzer: ChewyConfig.instance.statuses_analyzers.dig('tags', 'analyzer'), value: ->(status) { status.tags.map(&:display_name) })
field(:searchable_by, type: 'long', value: ->(status) { status.searchable_by }) field(:searchable_by, type: 'long', value: ->(status) { status.searchable_by })
field(:mentioned_by, type: 'long', value: ->(status) { status.mentioned_by }) field(:mentioned_by, type: 'long', value: ->(status) { status.mentioned_by })
field(:favourited_by, type: 'long', value: ->(status) { status.favourited_by }) field(:favourited_by, type: 'long', value: ->(status) { status.favourited_by })

View file

@ -3,36 +3,9 @@
class TagsIndex < Chewy::Index class TagsIndex < Chewy::Index
include DatetimeClampingConcern include DatetimeClampingConcern
settings index: index_preset(refresh_interval: '30s'), analysis: { # ElasticSearch config is moved to "/config/elasticsearch.default.yml".
analyzer: { # Edit it when original Mastodon changed ElasticSearch config.
content: { settings index: index_preset(refresh_interval: '30s'), analysis: ChewyConfig.instance.tags
tokenizer: 'keyword',
filter: %w(
word_delimiter_graph
lowercase
asciifolding
cjk_width
),
},
edge_ngram: {
tokenizer: 'edge_ngram',
filter: %w(
lowercase
asciifolding
cjk_width
),
},
},
tokenizer: {
edge_ngram: {
type: 'edge_ngram',
min_gram: 2,
max_gram: 15,
},
},
}
index_scope ::Tag.listable index_scope ::Tag.listable
@ -41,7 +14,9 @@ class TagsIndex < Chewy::Index
end end
root date_detection: false do root date_detection: false do
field(:name, type: 'text', analyzer: 'content', value: :display_name) { field(:edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content') } field(:name, type: 'text', analyzer: ChewyConfig.instance.tags_analyzers.dig('name', 'analyzer'), value: :display_name) do
field(:edge_ngram, type: 'text', analyzer: ChewyConfig.instance.tags_analyzers.dig('name', 'edge_ngram', 'analyzer'), search_analyzer: ChewyConfig.instance.tags_analyzers.dig('name', 'edge_ngram', 'search_analyzer'))
end
field(:reviewed, type: 'boolean', value: ->(tag) { tag.reviewed? }) field(:reviewed, type: 'boolean', value: ->(tag) { tag.reviewed? })
field(:usage, type: 'long', value: ->(tag, crutches) { tag.history.aggregate(crutches.time_period).accounts }) field(:usage, type: 'long', value: ->(tag, crutches) { tag.history.aggregate(crutches.time_period).accounts })
field(:last_status_at, type: 'date', value: ->(tag) { clamp_date(tag.last_status_at || tag.created_at) }) field(:last_status_at, type: 'date', value: ->(tag) { clamp_date(tag.last_status_at || tag.created_at) })

60
app/lib/chewy_config.rb Normal file
View file

@ -0,0 +1,60 @@
# frozen_string_literal: true
require 'singleton'
require 'yaml'
# Loads the Elasticsearch analysis settings consumed by the Chewy index classes.
#
# The configuration is read once per process (the class is a Singleton) from
# "config/elasticsearch.default.yml" and, when it exists, from
# ".elasticsearch.yml" in the Rails root. The two hashes are combined with a
# shallow Hash#merge, so a top-level key present in the custom file replaces
# the corresponding default key as a whole.
class ChewyConfig
  include Singleton

  # Raised when the "version" key of the loaded configuration does not equal CONFIG_VERSION.
  class InvalidElasticSearchVersionError < Mastodon::Error; end

  # Configuration format version this class expects in the YAML "version" key.
  CONFIG_VERSION = 1

  # @return [Hash] the merged configuration, keyed by strings as loaded from YAML
  attr_reader :config

  # Reads and merges the configuration files, then verifies the format version.
  #
  # @raise [InvalidElasticSearchVersionError] if the merged "version" is not CONFIG_VERSION
  def initialize
    custom_config_file = Rails.root.join('.elasticsearch.yml')
    default_config_file = Rails.root.join('config', 'elasticsearch.default.yml')

    # The custom file is optional; an absent (or empty) file falls back to {} below.
    custom_config = YAML.load_file(custom_config_file) if File.exist?(custom_config_file)
    default_config = YAML.load_file(default_config_file)

    @config = default_config.merge(custom_config || {})

    # In the test environment the Sudachi preset is merged on top of everything else.
    @config = @config.merge(YAML.load_file(Rails.root.join('config', 'elasticsearch.default-ja-sudachi.yml'))) if Rails.env.test?

    return if @config['version'] == CONFIG_VERSION

    raise InvalidElasticSearchVersionError, "ElasticSearch config version mismatch. expected version=#{CONFIG_VERSION} actual version=#{@config['version']}"
  end

  # @return [Hash, nil] value of the "accounts" key (passed as `analysis:` settings by AccountsIndex)
  def accounts
    config['accounts']
  end

  # @return [Hash, nil] value of the "accounts_analyzers" key (per-field analyzer names for AccountsIndex)
  def accounts_analyzers
    config['accounts_analyzers']
  end

  # @return [Hash, nil] value of the "public_statuses" key (passed as `analysis:` settings by PublicStatusesIndex)
  def public_statuses
    config['public_statuses']
  end

  # @return [Hash, nil] value of the "public_statuses_analyzers" key (per-field analyzer names for PublicStatusesIndex)
  def public_statuses_analyzers
    config['public_statuses_analyzers']
  end

  # @return [Hash, nil] value of the "statuses" key (passed as `analysis:` settings by StatusesIndex)
  def statuses
    config['statuses']
  end

  # @return [Hash, nil] value of the "statuses_analyzers" key (per-field analyzer names for StatusesIndex)
  def statuses_analyzers
    config['statuses_analyzers']
  end

  # @return [Hash, nil] value of the "tags" key (passed as `analysis:` settings by TagsIndex)
  def tags
    config['tags']
  end

  # @return [Hash, nil] value of the "tags_analyzers" key (per-field analyzer names for TagsIndex)
  def tags_analyzers
    config['tags_analyzers']
  end
end

View file

@ -0,0 +1,234 @@
# This is a configuration file for environments that use Japanese and Sudachi plug-ins.
# To use this file, copy it to the Mastodon root directory and rename the file to ".elasticsearch.yml".
version: 1
accounts:
filter:
english_stop:
type: stop
stopwords: _english_
english_stemmer:
type: stemmer
language: english
english_possessive_stemmer:
type: stemmer
language: possessive_english
my_posfilter:
type: sudachi_part_of_speech
stoptags:
- 助詞
- 助動詞
- 補助記号,句点
- 補助記号,読点
analyzer:
natural:
tokenizer: standard
filter:
- lowercase
- asciifolding
- cjk_width
- elision
- english_possessive_stemmer
- english_stop
- english_stemmer
sudachi_analyzer:
type: custom
tokenizer: sudachi_tokenizer
filter:
- my_posfilter
- sudachi_normalizedform
verbatim:
tokenizer: standard
filter:
- lowercase
- asciifolding
- cjk_width
edge_ngram:
tokenizer: edge_ngram
filter:
- lowercase
- asciifolding
- cjk_width
tokenizer:
edge_ngram:
type: edge_ngram
min_gram: 1
max_gram: 15
sudachi_tokenizer:
resources_path: '/etc/elasticsearch/sudachi'
split_mode: A
type: sudachi_tokenizer
discard_punctuation: 'true'
public_statuses:
filter:
english_stop:
type: stop
stopwords: _english_
english_stemmer:
type: stemmer
language: english
english_possessive_stemmer:
type: stemmer
language: possessive_english
my_posfilter:
type: sudachi_part_of_speech
stoptags:
- 助詞
- 助動詞
- 補助記号,句点
- 補助記号,読点
analyzer:
content:
tokenizer: uax_url_email
filter:
- english_possessive_stemmer
- lowercase
- asciifolding
- cjk_width
- english_stop
- english_stemmer
hashtag:
tokenizer: keyword
filter:
- word_delimiter_graph
- lowercase
- asciifolding
- cjk_width
sudachi_analyzer:
tokenizer: sudachi_tokenizer
type: custom
filter:
- english_possessive_stemmer
- lowercase
- asciifolding
- cjk_width
- english_stop
- english_stemmer
- my_posfilter
- sudachi_normalizedform
tokenizer:
sudachi_tokenizer:
resources_path: '/etc/elasticsearch/sudachi'
split_mode: A
type: sudachi_tokenizer
discard_punctuation: 'true'
# Analysis settings for the statuses index (Japanese / Sudachi preset).
# The `tokenizer` section at the end configures the `sudachi_tokenizer` that
# `sudachi_analyzer` refers to; it carries the same values as the
# `public_statuses` section of this file.
statuses:
  filter:
    english_stop:
      type: stop
      stopwords: _english_
    english_stemmer:
      type: stemmer
      language: english
    english_possessive_stemmer:
      type: stemmer
      language: possessive_english
    my_posfilter:
      type: sudachi_part_of_speech
      # Part-of-speech tags to drop: particles, auxiliary verbs, periods, commas.
      stoptags:
        - 助詞
        - 助動詞
        - 補助記号,句点
        - 補助記号,読点
  analyzer:
    verbatim:
      tokenizer: uax_url_email
      filter:
        - lowercase
    content:
      tokenizer: uax_url_email
      filter:
        - english_possessive_stemmer
        - lowercase
        - asciifolding
        - cjk_width
        - english_stop
        - english_stemmer
    hashtag:
      tokenizer: keyword
      filter:
        - word_delimiter_graph
        - lowercase
        - asciifolding
        - cjk_width
    sudachi_analyzer:
      tokenizer: sudachi_tokenizer
      type: custom
      filter:
        - english_possessive_stemmer
        - lowercase
        - asciifolding
        - cjk_width
        - english_stop
        - english_stemmer
        - my_posfilter
        - sudachi_normalizedform
  # Without this definition the analyzer above would not receive the
  # split_mode / resources_path settings listed here.
  tokenizer:
    sudachi_tokenizer:
      resources_path: '/etc/elasticsearch/sudachi'
      split_mode: A
      type: sudachi_tokenizer
      discard_punctuation: 'true'
tags:
analyzer:
content:
tokenizer: keyword
filter:
- word_delimiter_graph
- lowercase
- asciifolding
- cjk_width
edge_ngram:
tokenizer: edge_ngram
filter:
- lowercase
- asciifolding
- cjk_width
tokenizer:
edge_ngram:
type: edge_ngram
min_gram: 2
max_gram: 15
accounts_analyzers:
display_name:
analyzer: verbatim
edge_ngram:
analyzer: edge_ngram
search_analyzer: verbatim
username:
analyzer: verbatim
edge_ngram:
analyzer: edge_ngram
search_analyzer: verbatim
text:
analyzer: sudachi_analyzer
stemmed:
analyzer: natural
public_statuses_analyzers:
text:
analyzer: sudachi_analyzer
stemmed:
analyzer: content
tags:
analyzer: hashtag
statuses_analyzers:
text:
analyzer: sudachi_analyzer
stemmed:
analyzer: content
tags:
analyzer: hashtag
tags_analyzers:
name:
analyzer: content
edge_ngram:
analyzer: edge_ngram
search_analyzer: content

View file

@ -0,0 +1,177 @@
# This file stores the standard Elasticsearch settings taken from the original Mastodon code.
# To override them, create a ".elasticsearch.yml" file in the Mastodon root directory.
# Each top-level key defined in that file replaces the key of the same name here as a whole.
version: 1
accounts:
filter:
english_stop:
type: stop
stopwords: _english_
english_stemmer:
type: stemmer
language: english
english_possessive_stemmer:
type: stemmer
language: possessive_english
analyzer:
natural:
tokenizer: standard
filter:
- lowercase
- asciifolding
- cjk_width
- elision
- english_possessive_stemmer
- english_stop
- english_stemmer
verbatim:
tokenizer: standard
filter:
- lowercase
- asciifolding
- cjk_width
edge_ngram:
tokenizer: edge_ngram
filter:
- lowercase
- asciifolding
- cjk_width
tokenizer:
edge_ngram:
type: edge_ngram
min_gram: 1
max_gram: 15
public_statuses:
filter:
english_stop:
type: stop
stopwords: _english_
english_stemmer:
type: stemmer
language: english
english_possessive_stemmer:
type: stemmer
language: possessive_english
analyzer:
verbatim:
tokenizer: uax_url_email
filter:
- lowercase
content:
tokenizer: standard
filter:
- lowercase
- asciifolding
- cjk_width
- elision
- english_possessive_stemmer
- english_stop
- english_stemmer
hashtag:
tokenizer: keyword
filter:
- word_delimiter_graph
- lowercase
- asciifolding
- cjk_width
# Analysis settings for the statuses index (default preset).
statuses:
  filter:
    english_stop:
      type: stop
      stopwords: _english_
    english_stemmer:
      type: stemmer
      language: english
    english_possessive_stemmer:
      type: stemmer
      language: possessive_english
  analyzer:
    verbatim:
      tokenizer: uax_url_email
      filter:
        - lowercase
    content:
      tokenizer: standard
      filter:
        - lowercase
        - asciifolding
        - cjk_width
        - elision
        - english_possessive_stemmer
        - english_stop
        - english_stemmer
    hashtag:
      # Each tag is kept as a single token and split only by word_delimiter_graph;
      # `keyword` matches the `public_statuses` hashtag analyzer in this file.
      tokenizer: keyword
      filter:
        - word_delimiter_graph
        - lowercase
        - asciifolding
        - cjk_width
tags:
analyzer:
content:
tokenizer: keyword
filter:
- word_delimiter_graph
- lowercase
- asciifolding
- cjk_width
edge_ngram:
tokenizer: edge_ngram
filter:
- lowercase
- asciifolding
- cjk_width
tokenizer:
edge_ngram:
type: edge_ngram
min_gram: 2
max_gram: 15
accounts_analyzers:
display_name:
analyzer: verbatim
edge_ngram:
analyzer: edge_ngram
search_analyzer: verbatim
username:
analyzer: verbatim
edge_ngram:
analyzer: edge_ngram
search_analyzer: verbatim
text:
analyzer: verbatim
stemmed:
analyzer: natural
public_statuses_analyzers:
text:
analyzer: verbatim
stemmed:
analyzer: content
tags:
analyzer: hashtag
statuses_analyzers:
text:
analyzer: verbatim
stemmed:
analyzer: content
tags:
analyzer: hashtag
tags_analyzers:
name:
analyzer: content
edge_ngram:
analyzer: edge_ngram
search_analyzer: content

View file

@ -1,5 +1,7 @@
# frozen_string_literal: true # frozen_string_literal: true
require_relative '../../app/lib/chewy_config'
enabled = ENV['ES_ENABLED'] == 'true' enabled = ENV['ES_ENABLED'] == 'true'
host = ENV.fetch('ES_HOST') { 'localhost' } host = ENV.fetch('ES_HOST') { 'localhost' }
port = ENV.fetch('ES_PORT') { 9200 } port = ENV.fetch('ES_PORT') { 9200 }
@ -37,3 +39,6 @@ Chewy.use_after_commit_callbacks = false
# Mastodon is run with hidden services enabled, because # Mastodon is run with hidden services enabled, because
# Elasticsearch is *not* supposed to be accessed through a proxy # Elasticsearch is *not* supposed to be accessed through a proxy
Faraday.ignore_env_proxy = true Faraday.ignore_env_proxy = true
# Check the Elasticsearch config file version at boot: building the ChewyConfig
# singleton loads the YAML files and raises if the version does not match, so a
# bad configuration fails startup (only when Elasticsearch is enabled).
ChewyConfig.instance.accounts if enabled