Elasticsearch's relevance score is based on the term frequency / inverse document frequency model, TF/IDF for short. Since Elasticsearch 5.0 the default similarity is BM25, a refinement of TF/IDF, and that is the formula you see in the explain output below.
Algorithm walkthrough:
GET /test_index/test_type/_search?explain
{
  "query": {
    "match": {
      "test_field": "test hello"
    }
  }
}
{ "took": 1, "timed_out": false, "_shards": { "total": 5, "successful": 5, "failed": 0 }, "hits": { "total": 3, "max_score": 0.843298, "hits": [ { "_shard": "[test_index][2]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "6", "_score": 0.843298, "_source": { "test_field": "test test" }, "_explanation": { "value": 0.843298, "description": "sum of:", "details": [ { "value": 0.843298, "description": "sum of:", "details": [ { "value": 0.843298, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.843298, "description": "score(doc=0,freq=2.0 = termFreq=2.0\n), product of:", "details": [ { "value": 0.6931472, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 2, "description": "docFreq", "details": [] }, { "value": 4, "description": "docCount", "details": [] } ] }, { "value": 1.2166219, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 2, "description": "termFreq=2.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 1.75, "description": "avgFieldLength", "details": [] }, { "value": 2.56, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "_type:test_type, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } }, { "_shard": "[test_index][1]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "8", "_score": 0.43445712, "_source": { "test_field": "test client 2" }, "_explanation": { "value": 0.43445715, "description": "sum of:", "details": [ { "value": 0.43445715, "description": "sum of:", "details": [ { "value": 0.43445715, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.43445715, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.47000363, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 2, "description": "docFreq", "details": [] }, { "value": 3, "description": "docCount", "details": [] } ] }, { "value": 0.92436975, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 3.3333333, "description": "avgFieldLength", "details": [] }, { "value": 4, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "_type:test_type, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } }, { "_shard": "[test_index][3]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "7", "_score": 
0.25316024, "_source": { "test_field": "test client 1" }, "_explanation": { "value": 0.25316024, "description": "sum of:", "details": [ { "value": 0.25316024, "description": "sum of:", "details": [ { "value": 0.25316024, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.25316024, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.2876821, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 1, "description": "docCount", "details": [] } ] }, { "value": 0.88, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 3, "description": "avgFieldLength", "details": [] }, { "value": 4, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "*:*, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } } ] } }
Searching relies on the inverted index, but sorting relies on the forward index. Once the inverted index has located the matching documents, each field of each document is read in order to sort them; this forward index is exactly what doc values are.
When Elasticsearch builds an index, it builds the inverted index for search, and at the same time it builds the forward index, i.e. doc values, which serve sorting, aggregations, filtering, and so on.
Doc values are stored on disk. If there is enough memory, the operating system automatically caches them in memory and performance stays high; if memory is insufficient, the operating system keeps them on disk.
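Doc values are built by default for every indexed field except analyzed text fields, so they cost disk space. If a field will never be sorted, aggregated, or scripted on, they can be switched off in the mapping. A minimal sketch, reusing test_index/test_type but with a hypothetical keyword field named session_id:

# Disable doc values for a field that is only ever searched, never sorted/aggregated
PUT /test_index/_mapping/test_type
{
  "properties": {
    "session_id": {
      "type": "keyword",
      "doc_values": false
    }
  }
}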
Here is an example illustrating the forward index and the inverted index.
Suppose an index has two docs:
doc1 : hello world you and me
doc2 : hi world, how are you
Build the inverted index:
word     doc1    doc2
hello    *
world    *       *
you      *       *
and      *
me       *
hi               *
how              *
are              *
Suppose an index has two docs:
doc1: {"name": "jack", "age": 27} doc2: {"name": "tom", "age": 30}
Build the forward index (doc values):
document    name    age
doc1        jack    27
doc2        tom     30
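Sorting or aggregating on a field such as age is exactly where this column-oriented structure is read: the values are scanned per document instead of going through the inverted index. A minimal sketch, assuming a hypothetical index my_index that holds the two docs above with a numeric age field:

# Sort and aggregate on age; both operations are served by doc values
GET /my_index/_search
{
  "query": { "match_all": {} },
  "sort": [ { "age": "desc" } ],
  "aggs": {
    "avg_age": { "avg": { "field": "age" } }
  }
}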
How it works:
In other words, Elasticsearch's query phase builds the search results with a priority queue.
For example, suppose there are 60,000 documents in total across three primary shards, 20,000 per shard, with 10 documents per page. To fetch page 1,000 you actually need documents 10,001–10,010, so a priority queue of size 10,010 is built.
Note: do not assume that each shard returns only 10 documents here. That understanding is wrong!
A more detailed breakdown:
The request may first land on a node that holds no shard of this index; that node acts as the coordinating node and forwards the search request to the nodes holding the index's three shards. In the scenario above, to fetch page 1,000 of the 60,000 documents, each shard must produce its own top 10,010 documents out of its 20,000, not just 10. Each of the three shards returns 10,010 entries to the coordinating node, which therefore receives 30,030 entries in total, builds a priority queue of size 30,030 over them, sorts them by _score, and then takes entries 10,001–10,010: the 10 documents of page 1,000 that we wanted.
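In request form, fetching that page is just a from + size search; a sketch against the test_index used earlier. Note that with the default index.max_result_window of 10,000, from + size = 10,010 is rejected out of the box, which is exactly the deep-pagination limit that the scroll section below works around.

# Documents 10,001–10,010 (page 1,000 in the scenario above)
GET /test_index/test_type/_search
{
  "query": { "match_all": {} },
  "from": 10000,
  "size": 10
}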
Imagine two documents with identical values in a timestamp field, and the search results are sorted on that timestamp field. Because search requests are round-robined across all eligible shard copies, the primary shard may return the two documents in one order while a replica returns them in another.
The bouncing results problem: every time the user refreshes the page, the results appear in a different order. The problem can be avoided by always routing the same user to the same shards, which is done by setting the preference parameter to an arbitrary fixed value, such as the user's session ID.
The preference parameter controls which shards or nodes handle a search request. It accepts values such as _primary, _primary_first, _local, _only_node:xyz, _prefer_node:xyz, and _shards:2,3, which are explained in detail on the search preference documentation page.
But the most useful value is an arbitrary string, because it avoids the bouncing results problem.
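A minimal sketch of pinning a user to the same shard copies, assuming a hypothetical session ID user_12345 as the preference value:

# Same preference string => the same shard copies serve every request for this user
GET /test_index/test_type/_search?preference=user_12345
{
  "query": {
    "match": { "test_field": "test" }
  }
}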
In practice, from + size inevitably runs into the deep-pagination bottleneck, and the scroll technique is a good way around it. For example, if we need to pull 100,000 documents in one go, from + size clearly performs very poorly because the priority queue becomes huge. With a scroll query we can fetch the data batch by batch until everything has been read.
How scroll works
A scroll search takes a snapshot of the view at the time of the first search and serves all later requests from that old snapshot; any data changes made in the meantime are not visible to the user. In addition, Elasticsearch sorts internally by _doc, which skips scoring entirely, so performance is high.
Example:
# Use scroll
POST /test_index/_search?scroll=1m
{
  "query": { "match_all": {} },
  "sort": [ "_doc" ],
  "size": 3
}
The response returns a scroll_id:
{ "_scroll_id": "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAAI-sFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACPqxYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3AAAAAAAAj68WMUxkcUxGcXhRUXE0eGcyTXBoSV9ndwAAAAAAAI-tFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACPrhYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3", "took": 3, "timed_out": false, "_shards": { "total": 5, "successful": 5, "failed": 0 }, "hits": { "total": 12, "max_score": null, "hits": [ { "_index": "test_index", "_type": "test_type", "_id": "AWypxxLYFCl_S-ox4wvd", "_score": null, "_source": { "test_content": "my test" }, "sort": [ 0 ] }, { "_index": "test_index", "_type": "test_type", "_id": "6", "_score": null, "_source": { "test_field": "test test" }, "sort": [ 0 ] }, { "_index": "test_index", "_type": "test_type", "_id": "7", "_score": null, "_source": { "test_field": "test client 1" }, "sort": [ 0 ] } ] } }
Scrolling through the results
# Scroll search
POST _search/scroll
{
  "scroll": "1m",
  "scroll_id": "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAAJDMFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQzRYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3AAAAAAAAkM8WMUxkcUxGcXhRUXE0eGcyTXBoSV9ndwAAAAAAAJDOFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQ0BYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3"
}
Search results:
{ "_scroll_id": "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAAJDMFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQzRYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3AAAAAAAAkM8WMUxkcUxGcXhRUXE0eGcyTXBoSV9ndwAAAAAAAJDOFjFMZHFMRnF4UVFxNHhnMk1waElfZ3cAAAAAAACQ0BYxTGRxTEZxeFFRcTR4ZzJNcGhJX2d3", "took": 1, "timed_out": false, "terminated_early": true, "_shards": { "total": 5, "successful": 5, "failed": 0 }, "hits": { "total": 12, "max_score": null, "hits": [ { "_index": "test_index", "_type": "test_type", "_id": "11", "_score": null, "_source": { "num": 0, "tags": [] }, "sort": [ 0 ] }, { "_index": "test_index", "_type": "test_type", "_id": "8", "_score": null, "_source": { "test_field": "test client 2" }, "sort": [ 1 ] }, { "_index": "test_index", "_type": "test_type", "_id": "4", "_score": null, "_source": { "test_field": "test4" }, "sort": [ 1 ] } ] } }