New

The executive guide to generative AI

Read more

Truncate token filter

edit

Truncates tokens that exceed a specified character limit. This limit defaults to 10 but can be customized using the length parameter.

For example, you can use the truncate filter with the length parameter set to 3 to shorten all tokens to 3 characters or fewer, changing "jumping fox" to "jum fox".

This filter uses Lucene’s TruncateTokenFilter.

Example

edit

The following analyze API request uses the truncate filter, with its default limit of 10 characters, to shorten the tokens that exceed that limit in the text "the quinquennial extravaganza carried on":

resp = client.indices.analyze(
    tokenizer="whitespace",
    filter=[
        "truncate"
    ],
    text="the quinquennial extravaganza carried on",
)
print(resp)
response = client.indices.analyze(
  body: {
    tokenizer: 'whitespace',
    filter: [
      'truncate'
    ],
    text: 'the quinquennial extravaganza carried on'
  }
)
puts response
const response = await client.indices.analyze({
  tokenizer: "whitespace",
  filter: ["truncate"],
  text: "the quinquennial extravaganza carried on",
});
console.log(response);
GET _analyze
{
  "tokenizer" : "whitespace",
  "filter" : ["truncate"],
  "text" : "the quinquennial extravaganza carried on"
}

The filter produces the following tokens:

[ the, quinquenni, extravagan, carried, on ]

Add to an analyzer

edit

The following create index API request uses the truncate filter to configure a new custom analyzer.

resp = client.indices.create(
    index="custom_truncate_example",
    settings={
        "analysis": {
            "analyzer": {
                "standard_truncate": {
                    "tokenizer": "standard",
                    "filter": [
                        "truncate"
                    ]
                }
            }
        }
    },
)
print(resp)
response = client.indices.create(
  index: 'custom_truncate_example',
  body: {
    settings: {
      analysis: {
        analyzer: {
          standard_truncate: {
            tokenizer: 'standard',
            filter: [
              'truncate'
            ]
          }
        }
      }
    }
  }
)
puts response
const response = await client.indices.create({
  index: "custom_truncate_example",
  settings: {
    analysis: {
      analyzer: {
        standard_truncate: {
          tokenizer: "standard",
          filter: ["truncate"],
        },
      },
    },
  },
});
console.log(response);
PUT custom_truncate_example
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "standard_truncate" : {
        "tokenizer" : "standard",
        "filter" : ["truncate"]
        }
      }
    }
  }
}

Configurable parameters

edit
length
(Optional, integer) Character limit for each token. Tokens exceeding this limit are truncated. Defaults to 10.

Customize

edit

To customize the truncate filter, duplicate it to create the basis for a new custom token filter. You can modify the filter using its configurable parameters.

For example, the following request creates a custom truncate filter, 5_char_trunc, that truncates tokens to 5 characters or fewer:

resp = client.indices.create(
    index="5_char_words_example",
    settings={
        "analysis": {
            "analyzer": {
                "lowercase_5_char": {
                    "tokenizer": "lowercase",
                    "filter": [
                        "5_char_trunc"
                    ]
                }
            },
            "filter": {
                "5_char_trunc": {
                    "type": "truncate",
                    "length": 5
                }
            }
        }
    },
)
print(resp)
response = client.indices.create(
  index: '5_char_words_example',
  body: {
    settings: {
      analysis: {
        analyzer: {
          "lowercase_5_char": {
            tokenizer: 'lowercase',
            filter: [
              '5_char_trunc'
            ]
          }
        },
        filter: {
          "5_char_trunc": {
            type: 'truncate',
            length: 5
          }
        }
      }
    }
  }
)
puts response
const response = await client.indices.create({
  index: "5_char_words_example",
  settings: {
    analysis: {
      analyzer: {
        lowercase_5_char: {
          tokenizer: "lowercase",
          filter: ["5_char_trunc"],
        },
      },
      filter: {
        "5_char_trunc": {
          type: "truncate",
          length: 5,
        },
      },
    },
  },
});
console.log(response);
PUT 5_char_words_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "lowercase_5_char": {
          "tokenizer": "lowercase",
          "filter": [ "5_char_trunc" ]
        }
      },
      "filter": {
        "5_char_trunc": {
          "type": "truncate",
          "length": 5
        }
      }
    }
  }
}
Was this helpful?
Feedback