001 ES suggest-IK 中文
本次实验的结论:text 字段使用 ik_smart 作为 analyzer 和 search_analyzer(注意:下文 mapping 中 completion 字段 pageid 实际配置的是 ik_max_word),结果如下:
(1)prefix completion 效果较为理想
(2)term / phrase suggest 均没有得到想要的效果
为什么用 ik_smart,而不是 ik_max_word?
ik_max_word 会按最细粒度进行分词(尽可能穷举出所有可能的词),我们检索的关键字同样会被按最细粒度切分;鉴于 es 进行 suggest 的机制,在进行最细粒度分词后,suggest 效果应该不会理想。
实际上,在设置 ik_max_word 分词后确实也没有得到想要的效果。
POST /ocr2/_analyze
{
"analyzer": "ik_smart",
"text": ["好记性使用手册不如烂笔头感叹号博客园"]
}
# result
{
"tokens" : [
{
"token" : "好记性",
"start_offset" : 0,
"end_offset" : 3,
"type" : "CN_WORD",
"position" : 0
},
{
"token" : "使用手册",
"start_offset" : 3,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "不如",
"start_offset" : 7,
"end_offset" : 9,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "烂",
"start_offset" : 9,
"end_offset" : 10,
"type" : "CN_CHAR",
"position" : 3
},
{
"token" : "笔头",
"start_offset" : 10,
"end_offset" : 12,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "博客园",
"start_offset" : 15,
"end_offset" : 18,
"type" : "CN_WORD",
"position" : 5
}
]
}
PUT /ocr2
{
"mappings" : {
"properties" : {
"text": {
"type": "text",
"analyzer":"ik_smart",
"search_analyzer":"ik_smart"
},
"pageid": {
"type": "completion",
"search_analyzer": "ik_max_word",
"analyzer": "ik_max_word"
}
}
}
}
POST /ocr2/_create/1
{"pageid":"好记性使用手册不如烂笔头感叹号博客园", "text":"好记性使用手册不如烂笔头感叹号博客园"}
POST /ocr2/_create/2
{"pageid":"好记性使用手川不如烂笔头感叹号博客园", "text":"好记性使用手册不如烂笔头感叹号博客园"}
POST /ocr2/_create/3
{"pageid":"好记性使用手川不如烂笔头感叹号博客园", "text":"好记性使用手川不如烂笔头感叹号博客园"}
POST /ocr2/_search
{
"suggest": {
"my-suggestion": {
"prefix": "好记性",
"completion": {
"field": "pageid"
}
}
}
}
# result:从结果可见,前缀匹配效果较为理想,index 中的三条数据都得到了匹配
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"my-suggestion" : [
{
"text" : "好记性",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "好记性使用手册不如烂笔头感叹号博客园",
"_index" : "ocr2",
"_type" : "_doc",
"_id" : "1",
"_score" : 1.0,
"_source" : {
"pageid" : "好记性使用手册不如烂笔头感叹号博客园",
"text" : "好记性使用手册不如烂笔头感叹号博客园"
}
},
{
"text" : "好记性使用手川不如烂笔头感叹号博客园",
"_index" : "ocr2",
"_type" : "_doc",
"_id" : "2",
"_score" : 1.0,
"_source" : {
"pageid" : "好记性使用手川不如烂笔头感叹号博客园",
"text" : "好记性使用手册不如烂笔头感叹号博客园"
}
},
{
"text" : "好记性使用手川不如烂笔头感叹号博客园",
"_index" : "ocr2",
"_type" : "_doc",
"_id" : "3",
"_score" : 1.0,
"_source" : {
"pageid" : "好记性使用手川不如烂笔头感叹号博客园",
"text" : "好记性使用手川不如烂笔头感叹号博客园"
}
}
]
}
]
}
}
POST /ocr2/_search
{
"suggest": {
"my-suggestion": {
"text": "使用手川",
"term": {
"field": "text",
"suggest_mode": "always"
}
}
}
}
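# result:两个分词的 options 均为空。term suggester 的 min_word_length 默认值为 4,长度小于 4 的词不会参与纠错,而这里切出的“用手”“川”都不足 4 个字符,所以得不到任何建议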
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"my-suggestion" : [
{
"text" : "用手",
"offset" : 1,
"length" : 2,
"options" : [ ]
},
{
"text" : "川",
"offset" : 3,
"length" : 1,
"options" : [ ]
}
]
}
}
POST /ocr2/_search
{
"suggest": {
"my-suggestion": {
"text": "使用手川",
"phrase": {
"field": "text"
}
}
}
}
# result:options 同样为空,phrase suggest 也没有给出任何纠错建议
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"my-suggestion" : [
{
"text" : "使用手川",
"offset" : 0,
"length" : 4,
"options" : [ ]
}
]
}
}