ES Bulk Exception - circuit breaker errors
# Error message from the program
2022-10-08 06:39:22,109 7093:139831270405888 xxxx.py:112:INFO:Traceback (most recent call last):
  File "/users/jalchu/venv/lib/python3.6/site-packages/elasticsearch/helpers/actions.py", line 390, in bulk
    for ok, item in streaming_bulk(client, actions, *args, **kwargs):
  File "/users/jalchu/venv/lib/python3.6/site-packages/elasticsearch/helpers/actions.py", line 320, in streaming_bulk
    **kwargs
  File "/users/jalchu/venv/lib/python3.6/site-packages/elasticsearch/helpers/actions.py", line 247, in _process_bulk_chunk
    for item in gen:
  File "/users/jalchu/venv/lib/python3.6/site-packages/elasticsearch/helpers/actions.py", line 188, in _process_bulk_chunk_success
    raise BulkIndexError("%i document(s) failed to index." % len(errors), errors)
elasticsearch.helpers.errors.BulkIndexError: ('1 document(s) failed to index.', [{'index': {'_index': 'xxxx_index', '_type': '_doc', '_id': '6105733_8f4f8914-2437-46c7-a438-ec599ceca207', 'status': 429, 'error': {'type': 'circuit_breaking_exception', 'reason': '[parent] Data too large, data for [<transport_request>] would be [2123952938/1.9gb], which is larger than the limit of [2040109465/1.8gb], real usage: [2123950800/1.9gb], new bytes reserved: [2138/2kb], usages [request=0/0b, fielddata=7712/7.5kb, in_flight_requests=2138/2kb, accounting=1422929/1.3mb]'
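The 429 status in the error means the node rejected the bulk request rather than failing it permanently, so the client can ride out short memory spikes by retrying with backoff and using smaller chunks. A minimal sketch, assuming the same elasticsearch-py 7.x helpers as in the traceback; the host, index name, and document data are placeholders:

```python
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(["http://localhost:9200"])   # placeholder host

docs = [{"id": 1, "field": "value"}]            # placeholder documents

def gen_actions(docs, index="xxxx_index"):
    """Turn plain dicts into bulk index actions."""
    for doc in docs:
        yield {"_index": index, "_id": doc["id"], "_source": doc}

# streaming_bulk retries actions rejected with HTTP 429 (the status the
# circuit breaker returns above) with exponential backoff instead of failing.
for ok, item in helpers.streaming_bulk(
    es,
    gen_actions(docs),
    chunk_size=200,        # smaller chunks keep each <transport_request> small
    max_retries=5,         # retry 429s before giving up
    initial_backoff=2,     # seconds, doubled after every retry
    raise_on_error=False,  # report failures via (ok, item) instead of BulkIndexError
):
    if not ok:
        print("failed:", item)
```

Retrying only works around the symptom; the sections below look at the actual heap pressure on the cluster.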
# Explanation from the official documentation
https://www.elastic.co/guide/en/elasticsearch/reference/7.14/fix-common-cluster-issues.html#circuit-breaker-errors
# Checking heap usage of each node in the ES cluster
ES provides two APIs for viewing the current heap usage of every node in the cluster in real time. Open Dev Tools from the Kibana page and run the commands below.
- cat nodes API
API docs: https://www.elastic.co/guide/en/elasticsearch/reference/7.14/cat-nodes.html
API request:
GET _cat/nodes?v=true&h=name,node*,heap*
GET /_cat/nodes?v=true&h=id,ip,port,v,m,node*,heap*
Response: shows the heap information of each node; the key column is `heap.percent`:
name id node.role heap.current heap.percent heap.max
node-1 54jZ dilm 1.6gb 84 2gb
node-3 goAZ dilm 816.8mb 39 2gb
node-2 5NzV dilm 735.9mb 35 2gb
node-4 Gzzg dilm 673.7mb 32 2gb
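The same columns can also be pulled from a script instead of Dev Tools. A minimal sketch using the elasticsearch-py client (the host is a placeholder):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # placeholder host

# Same columns as the Dev Tools request above; returned as plain text.
print(es.cat.nodes(v=True, h="name,node.role,heap.current,heap.percent,heap.max"))
```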
- node stats API
Description: returns the JVM memory usage of each circuit breaker.
API docs: https://www.elastic.co/guide/en/elasticsearch/reference/7.14/cluster-nodes-stats.html
API request:
GET _nodes/stats/breaker
GET _nodes/stats?filter_path=nodes.*.jvm.mem.pools.old
Response:
{
"_nodes" : {
"total" : 4,
"successful" : 4,
"failed" : 0
},
"cluster_name" : "my-application",
"nodes" : {
"54jZKQglTz-L9DhUz7MCTQ" : {
"timestamp" : 1665542452623,
"name" : "node-1",
"transport_address" : "xxxx:9300",
"host" : "xxxx",
"ip" : "xxxx:9300",
"roles" : [
"ingest",
"master",
"data",
"ml"
],
"attributes" : {
"ml.machine_memory" : "33566990336",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true"
},
"breakers" : {
"request" : {
"limit_size_in_bytes" : 1288490188,
"limit_size" : "1.1gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.0,
"tripped" : 0
},
"fielddata" : {
"limit_size_in_bytes" : 858993459,
"limit_size" : "819.1mb",
"estimated_size_in_bytes" : 6648,
"estimated_size" : "6.4kb",
"overhead" : 1.03,
"tripped" : 0
},
"in_flight_requests" : {
"limit_size_in_bytes" : 2147483648,
"limit_size" : "2gb",
"estimated_size_in_bytes" : 1176,
"estimated_size" : "1.1kb",
"overhead" : 2.0,
"tripped" : 0
},
"accounting" : {
"limit_size_in_bytes" : 2147483648,
"limit_size" : "2gb",
"estimated_size_in_bytes" : 1136588,
"estimated_size" : "1mb",
"overhead" : 1.0,
"tripped" : 0
},
"parent" : {
"limit_size_in_bytes" : 2040109465,
"limit_size" : "1.8gb",
"estimated_size_in_bytes" : 328488032,
"estimated_size" : "313.2mb",
"overhead" : 1.0,
"tripped" : 734
}
}
},
"goAZ-YvVS5qOCXCe_ifi6g" : {
"timestamp" : 1665542452649,
"name" : "node-3",
"transport_address" : "xxxx:9300",
"host" : "xxxx",
"ip" : "xxxx:9300",
"roles" : [
"ingest",
"master",
"data",
"ml"
],
"attributes" : {
"ml.machine_memory" : "16655998976",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true"
},
"breakers" : {
"request" : {
"limit_size_in_bytes" : 1288490188,
"limit_size" : "1.1gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.0,
"tripped" : 1
},
"fielddata" : {
"limit_size_in_bytes" : 858993459,
"limit_size" : "819.1mb",
"estimated_size_in_bytes" : 4712,
"estimated_size" : "4.6kb",
"overhead" : 1.03,
"tripped" : 0
},
"in_flight_requests" : {
"limit_size_in_bytes" : 2147483648,
"limit_size" : "2gb",
"estimated_size_in_bytes" : 1176,
"estimated_size" : "1.1kb",
"overhead" : 2.0,
"tripped" : 0
},
"accounting" : {
"limit_size_in_bytes" : 2147483648,
"limit_size" : "2gb",
"estimated_size_in_bytes" : 3777449,
"estimated_size" : "3.6mb",
"overhead" : 1.0,
"tripped" : 0
},
"parent" : {
"limit_size_in_bytes" : 2040109465,
"limit_size" : "1.8gb",
"estimated_size_in_bytes" : 687099680,
"estimated_size" : "655.2mb",
"overhead" : 1.0,
"tripped" : 1172
}
}
},
"5NzVnffXRjaC54hkmCTo3g" : {
"timestamp" : 1665542452644,
"name" : "node-2",
"transport_address" : "xxxx:9300",
"host" : "xxxx",
"ip" : "xxxx:9300",
"roles" : [
"ingest",
"master",
"data",
"ml"
],
"attributes" : {
"ml.machine_memory" : "33566990336",
"xpack.installed" : "true",
"ml.max_open_jobs" : "20"
},
"breakers" : {
"request" : {
"limit_size_in_bytes" : 1288490188,
"limit_size" : "1.1gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.0,
"tripped" : 0
},
"fielddata" : {
"limit_size_in_bytes" : 858993459,
"limit_size" : "819.1mb",
"estimated_size_in_bytes" : 4872,
"estimated_size" : "4.7kb",
"overhead" : 1.03,
"tripped" : 0
},
"in_flight_requests" : {
"limit_size_in_bytes" : 2147483648,
"limit_size" : "2gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 2.0,
"tripped" : 0
},
"accounting" : {
"limit_size_in_bytes" : 2147483648,
"limit_size" : "2gb",
"estimated_size_in_bytes" : 1422122,
"estimated_size" : "1.3mb",
"overhead" : 1.0,
"tripped" : 0
},
"parent" : {
"limit_size_in_bytes" : 2040109465,
"limit_size" : "1.8gb",
"estimated_size_in_bytes" : 1707026432,
"estimated_size" : "1.5gb",
"overhead" : 1.0,
"tripped" : 284
}
}
},
"GzzgYowlQyicQQZeN7ykQw" : {
"timestamp" : 1665542452642,
"name" : "node-4",
"transport_address" : "xxxx:9300",
"host" : "xxxx",
"ip" : "xxxx:9300",
"roles" : [
"ingest",
"master",
"data",
"ml"
],
"attributes" : {
"ml.machine_memory" : "16655998976",
"ml.max_open_jobs" : "20",
"xpack.installed" : "true"
},
"breakers" : {
"request" : {
"limit_size_in_bytes" : 1288490188,
"limit_size" : "1.1gb",
"estimated_size_in_bytes" : 0,
"estimated_size" : "0b",
"overhead" : 1.0,
"tripped" : 0
},
"fielddata" : {
"limit_size_in_bytes" : 858993459,
"limit_size" : "819.1mb",
"estimated_size_in_bytes" : 6080,
"estimated_size" : "5.9kb",
"overhead" : 1.03,
"tripped" : 0
},
"in_flight_requests" : {
"limit_size_in_bytes" : 2147483648,
"limit_size" : "2gb",
"estimated_size_in_bytes" : 1176,
"estimated_size" : "1.1kb",
"overhead" : 2.0,
"tripped" : 0
},
"accounting" : {
"limit_size_in_bytes" : 2147483648,
"limit_size" : "2gb",
"estimated_size_in_bytes" : 3537488,
"estimated_size" : "3.3mb",
"overhead" : 1.0,
"tripped" : 0
},
"parent" : {
"limit_size_in_bytes" : 2040109465,
"limit_size" : "1.8gb",
"estimated_size_in_bytes" : 652858992,
"estimated_size" : "622.6mb",
"overhead" : 1.0,
"tripped" : 792
}
}
}
}
}
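With four nodes the full JSON above already gets long, so it can be handier to extract just the parent breaker per node in a script. A small sketch along the same lines (host is a placeholder):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # placeholder host

stats = es.nodes.stats(metric="breaker")  # same data as GET _nodes/stats/breaker
for node_id, node in stats["nodes"].items():
    parent = node["breakers"]["parent"]
    print(
        node["name"],
        "parent used:", parent["estimated_size"],
        "limit:", parent["limit_size"],
        "tripped:", parent["tripped"],   # how many times this breaker has fired
    )
```

In the output above, every node's parent breaker has a non-zero tripped count, which matches the circuit_breaking_exception in the bulk error.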
# Relieving JVM memory pressure
If any node in the queries above shows heap usage above 85%, Elasticsearch considers the cluster unhealthy and circuit breaker errors become likely, so we need to do something about it.
# Diagnosing high JVM memory usage
Query with the ES API: GET _nodes/stats?filter_path=nodes.*.jvm.mem.pools.old
JVM memory pressure = used_in_bytes / max_in_bytes (of the old-generation pool returned above)
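A quick way to apply that formula to every node; a sketch assuming the same elasticsearch-py client (host is a placeholder), filtering on the client side rather than with filter_path:

```python
from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # placeholder host

stats = es.nodes.stats(metric="jvm")  # includes jvm.mem.pools.old per node
for node in stats["nodes"].values():
    old = node["jvm"]["mem"]["pools"]["old"]
    pressure = old["used_in_bytes"] / old["max_in_bytes"]  # JVM memory pressure
    print(node["name"], f"{pressure:.0%}")
```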
As heap usage grows, JVM garbage collection becomes more and more frequent. GC events show up in elasticsearch.log; for example, the following entry indicates that Elasticsearch spent more than 50% of the last 40 seconds (21 seconds) on garbage collection.
[timestamp_short_interval_from_last][INFO ][o.e.m.j.JvmGcMonitorService] [node_id] [gc][number] overhead, spent [21s] collecting in the last [40s]
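Besides grepping elasticsearch.log, cumulative GC time can also be sampled from the nodes stats API. A rough sketch (host is a placeholder; sampling twice and taking the difference is just one simple way to do it):

```python
import time
from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # placeholder host

def old_gc_millis():
    """Cumulative old-generation GC time per node, in milliseconds."""
    stats = es.nodes.stats(metric="jvm")
    return {
        n["name"]: n["jvm"]["gc"]["collectors"]["old"]["collection_time_in_millis"]
        for n in stats["nodes"].values()
    }

before = old_gc_millis()
time.sleep(40)                     # same 40-second window as the log message above
after = old_gc_millis()
for name in after:
    spent = after[name] - before[name]
    print(name, f"spent {spent} ms of the last 40s in old-gen GC")
```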
# Ways to reduce JVM memory pressure
- Reduce the number of shards. A smaller number of larger shards is usually a better memory trade-off than many small shards.
- Avoid expensive searches, which consume more memory; the slow log helps identify them. Client searches can usually be constrained with a few settings (see the sketch after this list), for example:
  - Lower the index.max_result_window index setting.
  - Lower search.max_buckets, the cluster setting for the maximum number of aggregation buckets.
  - Disable expensive queries with the search.allow_expensive_queries cluster setting.
- Defining too many fields, or deeply nested fields, produces mappings that take a lot of memory to process; in ES terms this is a "mapping explosion". Use the mapping limit settings to constrain field mappings.
- Spread out bulk requests: bulk indexing and multi-search requests can cause high memory usage, so keep batches small and use them sparingly where possible.
- Add more memory to the nodes.
- Avoid fielddata on text fields. ES disables fielddata on text fields by default, so prefer keyword fields where possible. If fielddata has been enabled, the cache can be cleared with POST _cache/clear?fielddata=true.
- Avoid deleting documents with delete_by_query; deleting the whole index is the better option when it is feasible.
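Most of the search-related limits above can be applied through the same client. This is only a sketch: the index name and the values are placeholders, not recommendations, and search.allow_expensive_queries is not available on every 7.x release:

```python
from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # placeholder host

# Tighten the per-index result window (placeholder index name and value).
es.indices.put_settings(index="xxxx_index", body={"index.max_result_window": 5000})

# Cap aggregation buckets and disable expensive queries cluster-wide.
es.cluster.put_settings(body={
    "persistent": {
        "search.max_buckets": 10000,
        "search.allow_expensive_queries": False,  # requires a 7.x version that has this setting
    }
})

# If fielddata was ever enabled on text fields, clear its cache.
es.indices.clear_cache(fielddata=True)
```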
# What we actually did
From the information above, our ES nodes were only given 2 GB of JVM heap. First, let's look at the memory situation on the four node servers (ignoring swap).
Node 1: 32 GB RAM, 22 GB free, 6.3 GB used, 4.3 GB buff/cache, ES process using less than 10% of memory
Node 2: 32 GB RAM, 22 GB free, 4.7 GB used, 5.2 GB buff/cache, ES process using less than 10% of memory
Node 3: 16 GB RAM, 283 MB free, 3+ GB used, 12 GB buff/cache, ES process using more than 20% of memory
Node 4: 16 GB RAM, 2.6 GB free, about 4 GB used, 9.7 GB buff/cache, ES process using more than 20% of memory
`ps` the java process, mainly to check whether the Java runtime ES depends on is the bundled one or an external one.
[root@node3 config]# ps -efl|grep java
0 S esowner 8300 1 0 80 0 - 2605170 futex_ 2021 ? 3-05:19:40 /opt/es/elasticsearch-7.6.1/jdk/bin/java -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dio.netty.allocator.numDirectArenas=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.locale.providers=COMPAT -Xms2g -Xmx2g -XX:+UseG1GC -XX:CMSInitiatingOccupancyFraction=75 -XX:+UseCMSInitiatingOccupancyOnly -Djava.io.tmpdir=/opt/es/elasticsearch-7.6.1/tmp -Djna.tmpdir=/opt/es/elasticsearch-7.6.1/tmp -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m -XX:MaxDirectMemorySize=1073741824 -Des.path.home=/opt/es/elasticsearch-7.6.1 -Des.path.conf=/opt/es/elasticsearch-7.6.1/config -Des.distribution.flavor=default -Des.distribution.type=tar -Des.bundled_jdk=true -cp /opt/es/elasticsearch-7.6.1/lib/* org.elasticsearch.bootstrap.Elasticsearch -d
0 S root 22247 20981 0 80 0 - 28179 pipe_w 03:51 pts/0 00:00:00 grep --color=auto java
[root@node3 config]#
From the output above, the JDK sits under the ES directory, so this is the bundled JRE; configuring the heap size in /elasticsearch-7.6.1/config/jvm.options should be enough.
But first look at another problem: node 3 has noticeably little free memory, with a lot of it sitting in buff/cache, which looks a bit unhealthy.
[root@node3 jalchu]# free -mh
              total        used        free      shared  buff/cache   available
Mem:            15G        3.5G        280M        800M         11G         10G
Swap:          3.0G        1.5M        3.0G
[root@node3 jalchu]# ps -efl|grep python
4 S root 1487 1 0 80 0 - 143485 poll_s 2021 ? 00:58:12 /usr/bin/python2 -Es /usr/sbin/tuned -l -P
4 S root 30270 29618 0 80 0 - 28179 pipe_w 05:27 pts/0 00:00:00 grep --color=auto python
[root@node3 jalchu]#
node3 itself runs only the one ES Java process and one Redis sentinel process. Checking the externally exposed TCP listening ports, there appear to be only the SMTP service (25), the sentinel service (26379), the ES services (9200, 9300), and the ssh service (22).
[root@spoc-ta-job-3 jalchu]# netstat -tlun
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address           Foreign Address   State
tcp        0      0 127.0.0.1:25            0.0.0.0:*         LISTEN
tcp        0      0 0.0.0.0:43034           0.0.0.0:*         LISTEN
tcp        0      0 0.0.0.0:26379           0.0.0.0:*         LISTEN
tcp        0      0 10.241.94.34:9200       0.0.0.0:*         LISTEN
tcp        0      0 10.241.94.34:9300       0.0.0.0:*         LISTEN
tcp        0      0 0.0.0.0:22              0.0.0.0:*         LISTEN
udp        0      0 0.0.0.0:68              0.0.0.0:*
udp        0      0 10.241.94.34:123        0.0.0.0:*
udp        0      0 127.0.0.1:123           0.0.0.0:*
udp        0      0 0.0.0.0:123             0.0.0.0:*
Check the number of file handles held by each process on the server, since ES is heavy on this resource (the one-liner below, which counts open file descriptors per PID, was copied from a senior colleague on the project).
[root@spoc-ta-job-3 jalchu]# lsof|awk '{print $2}'|sort|uniq -c|sort -nr|head -n 20
123256 8300
490 1208
465 1487
392 1161
246 1225
240 8620
129 2077
120 1860
105 1903
94 1169
90 8322
87 29557
84 29543
78 13744
71 1
68 1130
58 13749
56 1863
53 1443
52 7269
PID 8300 is our ES process. Let's try restarting ES first, and then set the heap size.
Memory usage after killing ES: KiB Mem : 16265624 total, 4913864 free, 353968 used, 10997792 buff/cache
# Give the ES user ownership of the installation
[root@spoc-ta-job-3 jalchu]# chown -R esowner:esowner /opt/es/elasticsearch-7.6.1/
# Switch to the ES user
[root@spoc-ta-job-3 jalchu]# su esowner
[esowner@spoc-ta-job-3 jalchu]$ /opt/es/elasticsearch-7.6.1/bin/elasticsearch -d
Memory usage after the restart: KiB Mem : 16265624 total, 2558356 free, 2861784 used, 10845484 buff/cache
Adjust jvm.options: give ES 3 GB on the two 16 GB machines and 6 GB on the two 32 GB machines.
-Xms3g
-Xmx3g
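After restarting with the new options, the heap change can be double-checked via the nodes info API. A minimal sketch (host is a placeholder):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch(["http://localhost:9200"])  # placeholder host

# heap_max should now report 3 GB on the 16 GB machines and 6 GB on the 32 GB ones.
info = es.nodes.info(metric="jvm")
for node in info["nodes"].values():
    print(node["name"], node["jvm"]["mem"]["heap_max_in_bytes"] // (1024 ** 3), "GB heap")
```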
That's it for now; we'll keep observing.
# References
https://www.elastic.co/guide/en/elasticsearch/reference/7.14/fix-common-cluster-issues.html#circuit-breaker-errors
https://blog.csdn.net/laoyang360/article/details/123059159