normalizer

edit

The normalizer property of keyword fields is similar to analyzer except that it guarantees that the analysis chain produces a single token.

The normalizer is applied prior to indexing the keyword, as well as at search-time when the keyword field is searched via a query parser such as the match query or via a term-level query such as the term query.

Elasticsearch ships with a simple built-in normalizer called lowercase that can be used as-is. Custom normalizers can be defined as part of the analysis settings, as follows.

resp = client.indices.create(
    index="index",
    settings={
        "analysis": {
            "normalizer": {
                "my_normalizer": {
                    "type": "custom",
                    "char_filter": [],
                    "filter": [
                        "lowercase",
                        "asciifolding"
                    ]
                }
            }
        }
    },
    mappings={
        "properties": {
            "foo": {
                "type": "keyword",
                "normalizer": "my_normalizer"
            }
        }
    },
)
print(resp)

resp1 = client.index(
    index="index",
    id="1",
    document={
        "foo": "BÀR"
    },
)
print(resp1)

resp2 = client.index(
    index="index",
    id="2",
    document={
        "foo": "bar"
    },
)
print(resp2)

resp3 = client.index(
    index="index",
    id="3",
    document={
        "foo": "baz"
    },
)
print(resp3)

resp4 = client.indices.refresh(
    index="index",
)
print(resp4)

resp5 = client.search(
    index="index",
    query={
        "term": {
            "foo": "BAR"
        }
    },
)
print(resp5)

resp6 = client.search(
    index="index",
    query={
        "match": {
            "foo": "BAR"
        }
    },
)
print(resp6)
response = client.indices.create(
  index: 'index',
  body: {
    settings: {
      analysis: {
        normalizer: {
          my_normalizer: {
            type: 'custom',
            char_filter: [],
            filter: [
              'lowercase',
              'asciifolding'
            ]
          }
        }
      }
    },
    mappings: {
      properties: {
        foo: {
          type: 'keyword',
          normalizer: 'my_normalizer'
        }
      }
    }
  }
)
puts response

response = client.index(
  index: 'index',
  id: 1,
  body: {
    foo: 'BÀR'
  }
)
puts response

response = client.index(
  index: 'index',
  id: 2,
  body: {
    foo: 'bar'
  }
)
puts response

response = client.index(
  index: 'index',
  id: 3,
  body: {
    foo: 'baz'
  }
)
puts response

response = client.indices.refresh(
  index: 'index'
)
puts response

response = client.search(
  index: 'index',
  body: {
    query: {
      term: {
        foo: 'BAR'
      }
    }
  }
)
puts response

response = client.search(
  index: 'index',
  body: {
    query: {
      match: {
        foo: 'BAR'
      }
    }
  }
)
puts response
const response = await client.indices.create({
  index: "index",
  settings: {
    analysis: {
      normalizer: {
        my_normalizer: {
          type: "custom",
          char_filter: [],
          filter: ["lowercase", "asciifolding"],
        },
      },
    },
  },
  mappings: {
    properties: {
      foo: {
        type: "keyword",
        normalizer: "my_normalizer",
      },
    },
  },
});
console.log(response);

const response1 = await client.index({
  index: "index",
  id: 1,
  document: {
    foo: "BÀR",
  },
});
console.log(response1);

const response2 = await client.index({
  index: "index",
  id: 2,
  document: {
    foo: "bar",
  },
});
console.log(response2);

const response3 = await client.index({
  index: "index",
  id: 3,
  document: {
    foo: "baz",
  },
});
console.log(response3);

const response4 = await client.indices.refresh({
  index: "index",
});
console.log(response4);

const response5 = await client.search({
  index: "index",
  query: {
    term: {
      foo: "BAR",
    },
  },
});
console.log(response5);

const response6 = await client.search({
  index: "index",
  query: {
    match: {
      foo: "BAR",
    },
  },
});
console.log(response6);
PUT index
{
  "settings": {
    "analysis": {
      "normalizer": {
        "my_normalizer": {
          "type": "custom",
          "char_filter": [],
          "filter": ["lowercase", "asciifolding"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "foo": {
        "type": "keyword",
        "normalizer": "my_normalizer"
      }
    }
  }
}

PUT index/_doc/1
{
  "foo": "BÀR"
}

PUT index/_doc/2
{
  "foo": "bar"
}

PUT index/_doc/3
{
  "foo": "baz"
}

POST index/_refresh

GET index/_search
{
  "query": {
    "term": {
      "foo": "BAR"
    }
  }
}

GET index/_search
{
  "query": {
    "match": {
      "foo": "BAR"
    }
  }
}

The above queries match documents 1 and 2 because BÀR is normalized to bar at index time and the query term BAR is normalized to bar at query time.

{
  "took": 43,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped" : 0,
    "failed": 0
  },
  "hits": {
    "total" : {
        "value": 2,
        "relation": "eq"
    },
    "max_score": 0.4700036,
    "hits": [
      {
        "_index": "index",
        "_id": "1",
        "_score": 0.4700036,
        "_source": {
          "foo": "BÀR"
        }
      },
      {
        "_index": "index",
        "_id": "2",
        "_score": 0.4700036,
        "_source": {
          "foo": "bar"
        }
      }
    ]
  }
}

The fact that keywords are converted prior to indexing also means that aggregations return normalized values:

resp = client.search(
    index="index",
    size=0,
    aggs={
        "foo_terms": {
            "terms": {
                "field": "foo"
            }
        }
    },
)
print(resp)
response = client.search(
  index: 'index',
  body: {
    size: 0,
    aggregations: {
      foo_terms: {
        terms: {
          field: 'foo'
        }
      }
    }
  }
)
puts response
const response = await client.search({
  index: "index",
  size: 0,
  aggs: {
    foo_terms: {
      terms: {
        field: "foo",
      },
    },
  },
});
console.log(response);
GET index/_search
{
  "size": 0,
  "aggs": {
    "foo_terms": {
      "terms": {
        "field": "foo"
      }
    }
  }
}

This request returns:

{
  "took": 43,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped" : 0,
    "failed": 0
  },
  "hits": {
    "total" : {
        "value": 3,
        "relation": "eq"
    },
    "max_score": null,
    "hits": []
  },
  "aggregations": {
    "foo_terms": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": "bar",
          "doc_count": 2
        },
        {
          "key": "baz",
          "doc_count": 1
        }
      ]
    }
  }
}