HTML strip character filter
HTML strip character filter
Strips HTML elements from a text and replaces HTML entities with their decoded
value (e.g., replaces &amp; with &).
The html_strip
filter uses Lucene’s
HTMLStripCharFilter.
Example
The following analyze API request uses the
html_strip
filter to change the text <p>I'm so <b>happy</b>!</p>
to
\nI'm so happy!\n
.
resp = client.indices.analyze( tokenizer="keyword", char_filter=[ "html_strip" ], text="<p>I'm so <b>happy</b>!</p>", ) print(resp)
response = client.indices.analyze( body: { tokenizer: 'keyword', char_filter: [ 'html_strip' ], text: "<p>I'm so <b>happy</b>!</p>" } ) puts response
const response = await client.indices.analyze({ tokenizer: "keyword", char_filter: ["html_strip"], text: "<p>I'm so <b>happy</b>!</p>", }); console.log(response);
GET /_analyze { "tokenizer": "keyword", "char_filter": [ "html_strip" ], "text": "<p>I'm so <b>happy</b>!</p>" }
The filter produces the following text:
[ \nI'm so happy!\n ]
Add to an analyzer
The following create index API request uses the
html_strip
filter to configure a new
custom analyzer.
resp = client.indices.create( index="my-index-000001", settings={ "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "keyword", "char_filter": [ "html_strip" ] } } } }, ) print(resp)
response = client.indices.create( index: 'my-index-000001', body: { settings: { analysis: { analyzer: { my_analyzer: { tokenizer: 'keyword', char_filter: [ 'html_strip' ] } } } } } ) puts response
const response = await client.indices.create({ index: "my-index-000001", settings: { analysis: { analyzer: { my_analyzer: { tokenizer: "keyword", char_filter: ["html_strip"], }, }, }, }, }); console.log(response);
PUT /my-index-000001 { "settings": { "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "keyword", "char_filter": [ "html_strip" ] } } } } }
Configurable parameters
-
escaped_tags
-
(Optional, array of strings)
Array of HTML elements without enclosing angle brackets (
< >
). The filter skips these HTML elements when stripping HTML from the text. For example, a value of [ "p" ]
skips the <p>
HTML element.
Customize
To customize the html_strip
filter, duplicate it to create the basis for a new
custom character filter. You can modify the filter using its configurable
parameters.
The following create index API request
configures a new custom analyzer using a custom
html_strip
filter, my_custom_html_strip_char_filter
.
The my_custom_html_strip_char_filter
filter skips the removal of the <b>
HTML element.
resp = client.indices.create( index="my-index-000001", settings={ "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "keyword", "char_filter": [ "my_custom_html_strip_char_filter" ] } }, "char_filter": { "my_custom_html_strip_char_filter": { "type": "html_strip", "escaped_tags": [ "b" ] } } } }, ) print(resp)
response = client.indices.create( index: 'my-index-000001', body: { settings: { analysis: { analyzer: { my_analyzer: { tokenizer: 'keyword', char_filter: [ 'my_custom_html_strip_char_filter' ] } }, char_filter: { my_custom_html_strip_char_filter: { type: 'html_strip', escaped_tags: [ 'b' ] } } } } } ) puts response
const response = await client.indices.create({ index: "my-index-000001", settings: { analysis: { analyzer: { my_analyzer: { tokenizer: "keyword", char_filter: ["my_custom_html_strip_char_filter"], }, }, char_filter: { my_custom_html_strip_char_filter: { type: "html_strip", escaped_tags: ["b"], }, }, }, }, }); console.log(response);
PUT my-index-000001 { "settings": { "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "keyword", "char_filter": [ "my_custom_html_strip_char_filter" ] } }, "char_filter": { "my_custom_html_strip_char_filter": { "type": "html_strip", "escaped_tags": [ "b" ] } } } } }