Limit token count token filter
Limits the number of output tokens. The `limit` filter is commonly used to limit the size of document field values based on token count.

By default, the `limit` filter keeps only the first token in a stream. For example, the filter can change the token stream `[ one, two, three ]` to `[ one ]`.
This filter uses Lucene's `LimitTokenCountFilter`.
If you want to limit the size of field values based on _character length_, use the `ignore_above` mapping parameter.
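As a minimal sketch of the default behavior, in the Python client style of the examples below (assuming a connected `client` instance), running the `limit` filter with no parameters keeps only the first token:

Python:

resp = client.indices.analyze(
    tokenizer="standard",
    filter=["limit"],  # default max_token_count of 1
    text="one two three",
)
print(resp)  # only the first token, `one`, is returned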
Configurable parameters

`max_token_count`
(Optional, integer)
Maximum number of tokens to keep. Once this limit is reached, any remaining tokens are excluded from the output. Defaults to `1`.

`consume_all_tokens`
(Optional, Boolean)
If `true`, the `limit` filter exhausts the token stream, even if the `max_token_count` has already been reached. Defaults to `false`.
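None of the examples below sets `consume_all_tokens`. As a hedged illustration, a custom filter definition that keeps two tokens but still consumes the entire stream might look like the following sketch (the index and filter names are hypothetical):

Python:

resp = client.indices.create(
    index="consume_all_example",  # hypothetical index name
    settings={
        "analysis": {
            "analyzer": {
                "standard_two_token_limit": {
                    "tokenizer": "standard",
                    "filter": ["two_token_limit"],
                }
            },
            "filter": {
                "two_token_limit": {
                    "type": "limit",
                    "max_token_count": 2,
                    # read the full token stream even after the limit is hit
                    "consume_all_tokens": True,
                }
            },
        }
    },
)
print(resp)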
Example

The following analyze API request uses the `limit` filter to keep only the first two tokens in `quick fox jumps over lazy dog`:
Python:

resp = client.indices.analyze(
    tokenizer="standard",
    filter=[
        {
            "type": "limit",
            "max_token_count": 2
        }
    ],
    text="quick fox jumps over lazy dog",
)
print(resp)

Ruby:

response = client.indices.analyze(
  body: {
    tokenizer: 'standard',
    filter: [
      {
        type: 'limit',
        max_token_count: 2
      }
    ],
    text: 'quick fox jumps over lazy dog'
  }
)
puts response

JavaScript:

const response = await client.indices.analyze({
  tokenizer: "standard",
  filter: [
    {
      type: "limit",
      max_token_count: 2,
    },
  ],
  text: "quick fox jumps over lazy dog",
});
console.log(response);

Console:

GET _analyze
{
  "tokenizer": "standard",
  "filter": [
    {
      "type": "limit",
      "max_token_count": 2
    }
  ],
  "text": "quick fox jumps over lazy dog"
}
The filter produces the following tokens:

`[ quick, fox ]`
Add to an analyzer

The following create index API request uses the `limit` filter to configure a new custom analyzer.
Python:

resp = client.indices.create(
    index="limit_example",
    settings={
        "analysis": {
            "analyzer": {
                "standard_one_token_limit": {
                    "tokenizer": "standard",
                    "filter": ["limit"]
                }
            }
        }
    },
)
print(resp)

Ruby:

response = client.indices.create(
  index: 'limit_example',
  body: {
    settings: {
      analysis: {
        analyzer: {
          standard_one_token_limit: {
            tokenizer: 'standard',
            filter: ['limit']
          }
        }
      }
    }
  }
)
puts response

JavaScript:

const response = await client.indices.create({
  index: "limit_example",
  settings: {
    analysis: {
      analyzer: {
        standard_one_token_limit: {
          tokenizer: "standard",
          filter: ["limit"],
        },
      },
    },
  },
});
console.log(response);

Console:

PUT limit_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "standard_one_token_limit": {
          "tokenizer": "standard",
          "filter": [ "limit" ]
        }
      }
    }
  }
}
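To apply the analyzer to a field, reference it in the index mapping. A minimal sketch using the Python client (the `title` field name is hypothetical):

resp = client.indices.create(
    index="limit_example",
    settings={
        "analysis": {
            "analyzer": {
                "standard_one_token_limit": {
                    "tokenizer": "standard",
                    "filter": ["limit"]
                }
            }
        }
    },
    mappings={
        "properties": {
            # hypothetical field; its values are analyzed with the one-token analyzer
            "title": {"type": "text", "analyzer": "standard_one_token_limit"}
        }
    },
)
print(resp)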
Customize

To customize the `limit` filter, duplicate it to create the basis for a new custom token filter. You can modify the filter using its configurable parameters.

For example, the following request creates a custom `limit` filter that keeps only the first five tokens of a stream:
Python:

resp = client.indices.create(
    index="custom_limit_example",
    settings={
        "analysis": {
            "analyzer": {
                "whitespace_five_token_limit": {
                    "tokenizer": "whitespace",
                    "filter": ["five_token_limit"]
                }
            },
            "filter": {
                "five_token_limit": {
                    "type": "limit",
                    "max_token_count": 5
                }
            }
        }
    },
)
print(resp)

Ruby:

response = client.indices.create(
  index: 'custom_limit_example',
  body: {
    settings: {
      analysis: {
        analyzer: {
          whitespace_five_token_limit: {
            tokenizer: 'whitespace',
            filter: ['five_token_limit']
          }
        },
        filter: {
          five_token_limit: {
            type: 'limit',
            max_token_count: 5
          }
        }
      }
    }
  }
)
puts response

JavaScript:

const response = await client.indices.create({
  index: "custom_limit_example",
  settings: {
    analysis: {
      analyzer: {
        whitespace_five_token_limit: {
          tokenizer: "whitespace",
          filter: ["five_token_limit"],
        },
      },
      filter: {
        five_token_limit: {
          type: "limit",
          max_token_count: 5,
        },
      },
    },
  },
});
console.log(response);

Console:

PUT custom_limit_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "whitespace_five_token_limit": {
          "tokenizer": "whitespace",
          "filter": [ "five_token_limit" ]
        }
      },
      "filter": {
        "five_token_limit": {
          "type": "limit",
          "max_token_count": 5
        }
      }
    }
  }
}
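To check the custom filter, you can run the analyze API against the new index. A minimal sketch with the Python client (the sample text is made up for illustration):

resp = client.indices.analyze(
    index="custom_limit_example",
    analyzer="whitespace_five_token_limit",
    text="one two three four five six seven",
)
print(resp)  # only the first five tokens (one .. five) are returned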