New

The executive guide to generative AI

Read more

Length token filter

edit

Removes tokens shorter or longer than specified character lengths. For example, you can use the length filter to exclude tokens shorter than 2 characters and tokens longer than 5 characters.

This filter uses Lucene’s LengthFilter.

The length filter removes entire tokens. If you’d prefer to shorten tokens to a specific length, use the truncate filter.

Example

edit

The following analyze API request uses the length filter to remove tokens longer than 4 characters. The filter's min is set to 0, so no minimum length is enforced:

# Tokenize the sample sentence on whitespace, then apply a `length` token
# filter that removes tokens longer than 4 characters ("min": 0 enforces
# no minimum, so only the "max" bound has any effect here).
resp = client.indices.analyze(
    tokenizer="whitespace",
    filter=[
        {
            "type": "length",
            "min": 0,
            "max": 4
        }
    ],
    text="the quick brown fox jumps over the lazy dog",
)
# Expected surviving tokens: [ the, fox, over, the, lazy, dog ]
print(resp)
# Tokenize the sample sentence on whitespace, then apply a `length` token
# filter that removes tokens longer than 4 characters (min: 0 enforces no
# minimum, so only the max bound has any effect here).
response = client.indices.analyze(
  body: {
    tokenizer: 'whitespace',
    filter: [
      {
        type: 'length',
        min: 0,
        max: 4
      }
    ],
    text: 'the quick brown fox jumps over the lazy dog'
  }
)
# Expected surviving tokens: [ the, fox, over, the, lazy, dog ]
puts response
// Tokenize the sample sentence on whitespace, then apply a `length` token
// filter that removes tokens longer than 4 characters (min: 0 enforces no
// minimum, so only the max bound has any effect here).
const response = await client.indices.analyze({
  tokenizer: "whitespace",
  filter: [
    {
      type: "length",
      min: 0,
      max: 4,
    },
  ],
  text: "the quick brown fox jumps over the lazy dog",
});
// Expected surviving tokens: [ the, fox, over, the, lazy, dog ]
console.log(response);
GET _analyze
{
  "tokenizer": "whitespace",
  "filter": [
    {
      "type": "length",
      "min": 0,
      "max": 4
    }
  ],
  "text": "the quick brown fox jumps over the lazy dog"
}

The filter produces the following tokens:

[ the, fox, over, the, lazy, dog ]

Add to an analyzer

edit

The following create index API request uses the length filter to configure a new custom analyzer.

# Create an index whose custom analyzer "standard_length" chains the
# standard tokenizer with the `length` filter using its default settings
# (min 0, max Integer.MAX_VALUE — i.e. no tokens are removed until the
# filter is configured).
resp = client.indices.create(
    index="length_example",
    settings={
        "analysis": {
            "analyzer": {
                "standard_length": {
                    "tokenizer": "standard",
                    "filter": [
                        "length"
                    ]
                }
            }
        }
    },
)
print(resp)
# Create an index whose custom analyzer "standard_length" chains the
# standard tokenizer with the `length` filter using its default settings
# (min 0, max Integer.MAX_VALUE).
response = client.indices.create(
  index: 'length_example',
  body: {
    settings: {
      analysis: {
        analyzer: {
          standard_length: {
            tokenizer: 'standard',
            filter: [
              'length'
            ]
          }
        }
      }
    }
  }
)
puts response
// Create an index whose custom analyzer "standard_length" chains the
// standard tokenizer with the `length` filter using its default settings
// (min 0, max Integer.MAX_VALUE).
const response = await client.indices.create({
  index: "length_example",
  settings: {
    analysis: {
      analyzer: {
        standard_length: {
          tokenizer: "standard",
          filter: ["length"],
        },
      },
    },
  },
});
console.log(response);
PUT length_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "standard_length": {
          "tokenizer": "standard",
          "filter": [ "length" ]
        }
      }
    }
  }
}

Configurable parameters

edit
min
(Optional, integer) Minimum character length of a token. Shorter tokens are excluded from the output. Defaults to 0.
max
(Optional, integer) Maximum character length of a token. Longer tokens are excluded from the output. Defaults to Integer.MAX_VALUE, which is 2^31-1 or 2147483647.

Customize

edit

To customize the length filter, duplicate it to create the basis for a new custom token filter. You can modify the filter using its configurable parameters.

For example, the following request creates a custom length filter that removes tokens shorter than 2 characters and tokens longer than 10 characters:

# Create an index with a customized `length` filter ("length_2_to_10_char")
# that keeps only tokens between 2 and 10 characters long, and an analyzer
# ("whitespace_length_2_to_10_char") that applies it after whitespace
# tokenization.
resp = client.indices.create(
    index="length_custom_example",
    settings={
        "analysis": {
            "analyzer": {
                "whitespace_length_2_to_10_char": {
                    "tokenizer": "whitespace",
                    "filter": [
                        "length_2_to_10_char"
                    ]
                }
            },
            # Custom filter definition referenced by name in the analyzer above.
            "filter": {
                "length_2_to_10_char": {
                    "type": "length",
                    "min": 2,
                    "max": 10
                }
            }
        }
    },
)
print(resp)
# Create an index with a customized `length` filter ("length_2_to_10_char")
# that keeps only tokens between 2 and 10 characters long, and an analyzer
# ("whitespace_length_2_to_10_char") that applies it after whitespace
# tokenization.
response = client.indices.create(
  index: 'length_custom_example',
  body: {
    settings: {
      analysis: {
        analyzer: {
          "whitespace_length_2_to_10_char": {
            tokenizer: 'whitespace',
            filter: [
              'length_2_to_10_char'
            ]
          }
        },
        # Custom filter definition referenced by name in the analyzer above.
        filter: {
          "length_2_to_10_char": {
            type: 'length',
            min: 2,
            max: 10
          }
        }
      }
    }
  }
)
puts response
// Create an index with a customized `length` filter ("length_2_to_10_char")
// that keeps only tokens between 2 and 10 characters long, and an analyzer
// ("whitespace_length_2_to_10_char") that applies it after whitespace
// tokenization.
const response = await client.indices.create({
  index: "length_custom_example",
  settings: {
    analysis: {
      analyzer: {
        whitespace_length_2_to_10_char: {
          tokenizer: "whitespace",
          filter: ["length_2_to_10_char"],
        },
      },
      // Custom filter definition referenced by name in the analyzer above.
      filter: {
        length_2_to_10_char: {
          type: "length",
          min: 2,
          max: 10,
        },
      },
    },
  },
});
console.log(response);
PUT length_custom_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "whitespace_length_2_to_10_char": {
          "tokenizer": "whitespace",
          "filter": [ "length_2_to_10_char" ]
        }
      },
      "filter": {
        "length_2_to_10_char": {
          "type": "length",
          "min": 2,
          "max": 10
        }
      }
    }
  }
}
Was this helpful?
Feedback