normalizer
The normalizer property of keyword fields is similar to analyzer except that it guarantees that the analysis chain produces a single token.

The normalizer is applied prior to indexing the keyword, as well as at search time when the keyword field is searched via a query parser such as the match query or via a term-level query such as the term query.
A simple normalizer called lowercase ships with Elasticsearch and can be used directly.
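For example, the built-in lowercase normalizer can be referenced by name in a keyword field mapping without any custom analysis settings. A minimal sketch using the Python client (the index and field names here are illustrative, not from the examples below):

# Sketch: map a keyword field to the built-in "lowercase" normalizer.
resp = client.indices.create(
    index="lowercase-example",
    mappings={
        "properties": {
            "foo": {
                "type": "keyword",
                "normalizer": "lowercase"
            }
        }
    },
)
print(resp)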
Custom normalizers can be defined as part of analysis settings as follows.
Python:

resp = client.indices.create(
    index="index",
    settings={
        "analysis": {
            "normalizer": {
                "my_normalizer": {
                    "type": "custom",
                    "char_filter": [],
                    "filter": ["lowercase", "asciifolding"]
                }
            }
        }
    },
    mappings={
        "properties": {
            "foo": {
                "type": "keyword",
                "normalizer": "my_normalizer"
            }
        }
    },
)
print(resp)

resp1 = client.index(
    index="index",
    id="1",
    document={"foo": "BÀR"},
)
print(resp1)

resp2 = client.index(
    index="index",
    id="2",
    document={"foo": "bar"},
)
print(resp2)

resp3 = client.index(
    index="index",
    id="3",
    document={"foo": "baz"},
)
print(resp3)

resp4 = client.indices.refresh(
    index="index",
)
print(resp4)

resp5 = client.search(
    index="index",
    query={"term": {"foo": "BAR"}},
)
print(resp5)

resp6 = client.search(
    index="index",
    query={"match": {"foo": "BAR"}},
)
print(resp6)
Ruby:

response = client.indices.create(
  index: 'index',
  body: {
    settings: {
      analysis: {
        normalizer: {
          my_normalizer: {
            type: 'custom',
            char_filter: [],
            filter: ['lowercase', 'asciifolding']
          }
        }
      }
    },
    mappings: {
      properties: {
        foo: {
          type: 'keyword',
          normalizer: 'my_normalizer'
        }
      }
    }
  }
)
puts response

response = client.index(
  index: 'index',
  id: 1,
  body: { foo: 'BÀR' }
)
puts response

response = client.index(
  index: 'index',
  id: 2,
  body: { foo: 'bar' }
)
puts response

response = client.index(
  index: 'index',
  id: 3,
  body: { foo: 'baz' }
)
puts response

response = client.indices.refresh(index: 'index')
puts response

response = client.search(
  index: 'index',
  body: {
    query: { term: { foo: 'BAR' } }
  }
)
puts response

response = client.search(
  index: 'index',
  body: {
    query: { match: { foo: 'BAR' } }
  }
)
puts response
JavaScript:

const response = await client.indices.create({
  index: "index",
  settings: {
    analysis: {
      normalizer: {
        my_normalizer: {
          type: "custom",
          char_filter: [],
          filter: ["lowercase", "asciifolding"],
        },
      },
    },
  },
  mappings: {
    properties: {
      foo: {
        type: "keyword",
        normalizer: "my_normalizer",
      },
    },
  },
});
console.log(response);

const response1 = await client.index({
  index: "index",
  id: 1,
  document: { foo: "BÀR" },
});
console.log(response1);

const response2 = await client.index({
  index: "index",
  id: 2,
  document: { foo: "bar" },
});
console.log(response2);

const response3 = await client.index({
  index: "index",
  id: 3,
  document: { foo: "baz" },
});
console.log(response3);

const response4 = await client.indices.refresh({ index: "index" });
console.log(response4);

const response5 = await client.search({
  index: "index",
  query: { term: { foo: "BAR" } },
});
console.log(response5);

const response6 = await client.search({
  index: "index",
  query: { match: { foo: "BAR" } },
});
console.log(response6);
Console:

PUT index
{
  "settings": {
    "analysis": {
      "normalizer": {
        "my_normalizer": {
          "type": "custom",
          "char_filter": [],
          "filter": ["lowercase", "asciifolding"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "foo": {
        "type": "keyword",
        "normalizer": "my_normalizer"
      }
    }
  }
}

PUT index/_doc/1
{
  "foo": "BÀR"
}

PUT index/_doc/2
{
  "foo": "bar"
}

PUT index/_doc/3
{
  "foo": "baz"
}

POST index/_refresh

GET index/_search
{
  "query": {
    "term": {
      "foo": "BAR"
    }
  }
}

GET index/_search
{
  "query": {
    "match": {
      "foo": "BAR"
    }
  }
}
The above queries match documents 1 and 2, since BÀR is converted to bar at both index and query time.
{ "took": $body.took, "timed_out": false, "_shards": { "total": 1, "successful": 1, "skipped" : 0, "failed": 0 }, "hits": { "total" : { "value": 2, "relation": "eq" }, "max_score": 0.4700036, "hits": [ { "_index": "index", "_id": "1", "_score": 0.4700036, "_source": { "foo": "BÀR" } }, { "_index": "index", "_id": "2", "_score": 0.4700036, "_source": { "foo": "bar" } } ] } }
The fact that keywords are converted prior to indexing also means that aggregations return normalized values:
Python:

resp = client.search(
    index="index",
    size=0,
    aggs={
        "foo_terms": {
            "terms": {
                "field": "foo"
            }
        }
    },
)
print(resp)
Ruby:

response = client.search(
  index: 'index',
  body: {
    size: 0,
    aggregations: {
      foo_terms: {
        terms: { field: 'foo' }
      }
    }
  }
)
puts response
JavaScript:

const response = await client.search({
  index: "index",
  size: 0,
  aggs: {
    foo_terms: {
      terms: {
        field: "foo",
      },
    },
  },
});
console.log(response);
Console:

GET index/_search
{
  "size": 0,
  "aggs": {
    "foo_terms": {
      "terms": {
        "field": "foo"
      }
    }
  }
}
returns
{ "took": 43, "timed_out": false, "_shards": { "total": 1, "successful": 1, "skipped" : 0, "failed": 0 }, "hits": { "total" : { "value": 3, "relation": "eq" }, "max_score": null, "hits": [] }, "aggregations": { "foo_terms": { "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0, "buckets": [ { "key": "bar", "doc_count": 2 }, { "key": "baz", "doc_count": 1 } ] } } }