Version: User Guides (Cloud)

English

The english analyzer in Zilliz Cloud is designed to process English text, applying language-specific rules for tokenization and filtering.

Definition

The english analyzer uses the following components:

Tokenizer: Uses the standard tokenizer to split text into discrete word units.
Filters: Includes multiple filters for comprehensive text processing:
- lowercase: Converts all tokens to lowercase, enabling case-insensitive searches.
- stemmer: Reduces words to their root form to support broader matching (e.g., "running" becomes "run").
- stop: Removes common English stop words (configured through its stop_words parameter) to focus on key terms in text.

The functionality of the english analyzer is equivalent to the following custom analyzer configuration:

Python
Java
NodeJS
Go
cURL

analyzer_params = {
        "tokenizer": "standard",
        "filter": [
                "lowercase",
                {
                        "type": "stemmer",
                        "language": "english"
                }, {
                        "type": "stop",
                        "stop_words": "_english_"
                }
        ]
}

Map<String, Object> analyzerParams = new HashMap<>();
analyzerParams.put("tokenizer", "standard");
analyzerParams.put("filter",
        Arrays.asList("lowercase",
                new HashMap<String, Object>() {{
                    put("type", "stemmer");
                    put("language", "english");
                }},
                new HashMap<String, Object>() {{
                    put("type", "stop");
                    put("stop_words", Collections.singletonList("_english_"));
                }}
        )
);

const analyzer_params = {
    "tokenizer": "standard",
    "filter": [
        "lowercase",
        {
            "type": "stemmer",
            "language": "english"
        },
        {
            "type": "stop",
            "stop_words": "_english_"
        }
    ]
}

analyzerParams = map[string]any{"tokenizer": "standard",
        "filter": []any{"lowercase", map[string]any{
            "type":     "stemmer",
            "language": "english",
        }, map[string]any{
            "type":       "stop",
            "stop_words": "_english_",
        }}}

# restful
analyzerParams='{
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    {
      "type": "stemmer",
      "language": "english"
    },
    {
      "type": "stop",
      "stop_words": "_english_"
    }
  ]
}'

Configuration

To apply the english analyzer to a field, simply set type to english in analyzer_params, and include optional parameters as needed.

Python
Java
NodeJS
Go
cURL

analyzer_params = {
    "type": "english",
}

Map<String, Object> analyzerParams = new HashMap<>();
analyzerParams.put("type", "english");

const analyzer_params = {
    "type": "english",
}

analyzerParams = map[string]any{"type": "english"}

# restful
analyzerParams='{
  "type": "english"
}'

The english analyzer accepts the following optional parameters:

Parameter	Description
`stop_words`	An array of stop words to remove from the tokenized output. Defaults to `_english_`, a built-in set of common English stop words.

Example configuration with custom stop words:

Python
Java
NodeJS
Go
cURL

analyzer_params = {
    "type": "english",
    "stop_words": ["a", "an", "the"]
}

Map<String, Object> analyzerParams = new HashMap<>();
analyzerParams.put("type", "english");
analyzerParams.put("stop_words", Arrays.asList("a", "an", "the"));

const analyzer_params = {
    "type": "english",
    "stop_words": ["a", "an", "the"]
}

analyzerParams = map[string]any{"type": "english", "stop_words": []string{"a", "an", "the"}}

# restful
analyzerParams='{
  "type": "english",
  "stop_words": [
    "a",
    "an",
    "the"
  ]
}'

After defining analyzer_params, you can apply them to a VARCHAR field when defining a collection schema. This allows Zilliz Cloud to process the text in that field using the specified analyzer for efficient tokenization and filtering. For details, refer to Example use.

Examples

Before applying the analyzer configuration to your collection schema, verify its behavior using the run_analyzer method.

Analyzer configuration

Python
Java
NodeJS
Go
cURL

analyzer_params = {
    "type": "english",
    "stop_words": ["a", "an", "the"]
}

Map<String, Object> analyzerParams = new HashMap<>();
analyzerParams.put("type", "english");
analyzerParams.put("stop_words", Arrays.asList("a", "an", "the"));

// javascript

analyzerParams = map[string]any{"type": "english", "stop_words": []string{"a", "an", "the"}}

# restful
analyzerParams='{
  "type": "english",
  "stop_words": [
    "a",
    "an",
    "the"
  ]
}'

Verification using `run_analyzer`

Python
Java
NodeJS
Go
cURL

from pymilvus import (
    MilvusClient,
)

client = MilvusClient(
    uri="YOUR_CLUSTER_ENDPOINT",
    token="YOUR_CLUSTER_TOKEN"
)

# Sample text to analyze
sample_text = "Milvus is a vector database built for scale!"

# Run the english analyzer with the defined configuration
result = client.run_analyzer(sample_text, analyzer_params)
print("English analyzer output:", result)

import io.milvus.v2.client.ConnectConfig;
import io.milvus.v2.client.MilvusClientV2;
import io.milvus.v2.service.vector.request.RunAnalyzerReq;
import io.milvus.v2.service.vector.response.RunAnalyzerResp;

ConnectConfig config = ConnectConfig.builder()
        .uri("YOUR_CLUSTER_ENDPOINT")
        .token("YOUR_CLUSTER_TOKEN")
        .build();
MilvusClientV2 client = new MilvusClientV2(config);

List<String> texts = new ArrayList<>();
texts.add("Milvus is a vector database built for scale!");

RunAnalyzerResp resp = client.runAnalyzer(RunAnalyzerReq.builder()
        .texts(texts)
        .analyzerParams(analyzerParams)
        .build());
List<RunAnalyzerResp.AnalyzerResult> results = resp.getResults();

// javascript

import (
    "context"
    "encoding/json"
    "fmt"

    "github.com/milvus-io/milvus/client/v2/milvusclient"
)

client, err := milvusclient.New(ctx, &milvusclient.ClientConfig{
    Address: "YOUR_CLUSTER_ENDPOINT",
    APIKey:  "YOUR_CLUSTER_TOKEN",
})
if err != nil {
    fmt.Println(err.Error())
    // handle error
}

bs, _ := json.Marshal(analyzerParams)
texts := []string{"Milvus is a vector database built for scale!"}
option := milvusclient.NewRunAnalyzerOption(texts).
    WithAnalyzerParams(string(bs))

result, err := client.RunAnalyzer(ctx, option)
if err != nil {
    fmt.Println(err.Error())
    // handle error
}

# restful

Expected output

English analyzer output: ['milvus', 'vector', 'databas', 'built', 'scale']

Definition

Configuration

Examples

Analyzer configuration

Verification using run_analyzer