Merge pull request #99 from s-allius/health_check

Health check
This commit is contained in:
Stefan Allius
2024-06-16 23:08:12 +02:00
committed by GitHub
14 changed files with 168 additions and 55 deletions

View File

@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [unreleased]
- Add healthcheck, readiness and liveness checks [#91](https://github.com/s-allius/tsun-gen3-proxy/issues/91)
- MODBUS close handler releases internal resource [#93](https://github.com/s-allius/tsun-gen3-proxy/issues/93)
- add exception handling for message forwarding [#94](https://github.com/s-allius/tsun-gen3-proxy/issues/94)
- GEN3: make timestamp handling stateless, to avoid blocking when the TSUN cloud is down [#56](https://github.com/s-allius/tsun-gen3-proxy/issues/56)
@@ -29,7 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- catch all OSError errors in the read loop
- log Modbus traces with different log levels
- add Modbus fifo and timeout handler
- build version string in the same format as TSUN for GEN3 invterts
- build version string in the same format as TSUN for GEN3 inverters
- add graceful shutdown
- parse Modbus values and store them in the database
- add cron task to request the output power every minute

View File

@@ -64,7 +64,7 @@ COPY --chmod=0700 entrypoint.sh /root/entrypoint.sh
COPY config .
COPY src .
RUN date > /build-date.txt
EXPOSE 5005
EXPOSE 5005 8127 10000
# command to run on container start
ENTRYPOINT ["/root/entrypoint.sh"]

View File

@@ -17,6 +17,5 @@ if [ "$environment" = "production" ] ; then \
-name od -o \
-name strings -o \
-name su -o \
-name wget -o \
\) -delete \
; fi

View File

@@ -1,3 +1,4 @@
aiomqtt==2.0.1
schema==0.7.5
aiocron==1.8
aiocron==1.8
aiohttp==3.9.5

View File

@@ -1,9 +1,11 @@
import logging
import traceback
import time
from asyncio import StreamReader, StreamWriter
from messages import hex_dump_memory
from messages import hex_dump_memory, State
from typing import Self
import gc
logger = logging.getLogger('conn')
@@ -17,6 +19,8 @@ class AsyncStream():
self.addr = addr
self.r_addr = ''
self.l_addr = ''
self.proc_start = None # start processing start timestamp
self.proc_max = 0
async def server_loop(self, addr: str) -> None:
'''Loop for receiving messages from the inverter (server-side)'''
@@ -61,8 +65,14 @@ class AsyncStream():
"""Async loop handler for precessing all received messages"""
self.r_addr = self.writer.get_extra_info('peername')
self.l_addr = self.writer.get_extra_info('sockname')
self.proc_start = time.time()
while True:
try:
proc = time.time() - self.proc_start
if proc > self.proc_max:
self.proc_max = proc
self.proc_start = None
await self.__async_read()
if self.unique_id:
@@ -117,6 +127,17 @@ class AsyncStream():
logger.debug(f'AsyncStream.close() l{self.l_addr} | r{self.r_addr}')
self.writer.close()
def healthy(self) -> bool:
elapsed = 0
if self.proc_start is not None:
elapsed = time.time() - self.proc_start
if self.state == State.closed or elapsed > 1:
logging.debug(f'[{self.node_id}:{type(self).__name__}]'
f' act:{round(1000*elapsed)}ms'
f' max:{round(1000*self.proc_max)}ms')
logging.debug(f'Healthy()) refs: {gc.get_referrers(self)}')
return elapsed < 5
'''
Our private methods
'''
@@ -124,6 +145,7 @@ class AsyncStream():
"""Async read handler to read received data from TCP stream"""
data = await self.reader.read(4096)
if data:
self.proc_start = time.time()
self._recv_buffer += data
self.read() # call read in parent class
else:

View File

@@ -31,6 +31,10 @@ class ConnectionG3(AsyncStream, Talent):
async def async_publ_mqtt(self) -> None:
pass
def healthy(self) -> bool:
logger.debug('ConnectionG3 healthy()')
return AsyncStream.healthy(self)
'''
Our private methods
'''

View File

@@ -4,12 +4,12 @@ import time
from datetime import datetime
if __name__ == "app.src.gen3.talent":
from app.src.messages import hex_dump_memory, Message
from app.src.messages import hex_dump_memory, Message, State
from app.src.modbus import Modbus
from app.src.config import Config
from app.src.gen3.infos_g3 import InfosG3
else: # pragma: no cover
from messages import hex_dump_memory, Message
from messages import hex_dump_memory, Message, State
from modbus import Modbus
from config import Config
from gen3.infos_g3 import InfosG3
@@ -77,7 +77,7 @@ class Talent(Message):
# deallocated by the garbage collector ==> we get a memory leak
self.switch.clear()
self.log_lvl.clear()
self.state = self.STATE_CLOSED
self.state = State.closed
super().close()
def __set_serial_no(self, serial_no: str):
@@ -114,6 +114,9 @@ class Talent(Message):
if self.header_valid and len(self._recv_buffer) >= (self.header_len +
self.data_len):
if self.state == State.init:
self.state = State.received
log_lvl = self.log_lvl.get(self.msg_id, logging.WARNING)
if callable(log_lvl):
log_lvl = log_lvl()
@@ -141,7 +144,7 @@ class Talent(Message):
return
def send_modbus_cb(self, modbus_pdu: bytearray, log_lvl: int, state: str):
if self.state != self.STATE_UP:
if self.state != State.up:
logger.warning(f'[{self.node_id}] ignore MODBUS cmd,'
' cause the state is not UP anymore')
return
@@ -158,7 +161,7 @@ class Talent(Message):
self._send_buffer = bytearray(0) # self._send_buffer[sent:]
async def send_modbus_cmd(self, func, addr, val, log_lvl) -> None:
if self.state != self.STATE_UP:
if self.state != State.up:
logger.log(log_lvl, f'[{self.node_id}] ignore MODBUS cmd,'
' as the state is not UP')
return
@@ -371,7 +374,7 @@ class Talent(Message):
self._send_buffer += b'\x01'
self.__finish_send_msg()
self.__process_data()
self.state = self.STATE_UP
self.state = State.up
elif self.ctrl.is_resp():
return # ignore received response
@@ -387,7 +390,7 @@ class Talent(Message):
self._send_buffer += b'\x01'
self.__finish_send_msg()
self.__process_data()
self.state = self.STATE_UP
self.state = State.up
elif self.ctrl.is_resp():
return # ignore received response

View File

@@ -31,6 +31,10 @@ class ConnectionG3P(AsyncStream, SolarmanV5):
async def async_publ_mqtt(self) -> None:
pass
def healthy(self) -> bool:
logger.debug('ConnectionG3P healthy()')
return AsyncStream.healthy(self)
'''
Our private methods
'''

View File

@@ -6,13 +6,13 @@ import asyncio
from datetime import datetime
if __name__ == "app.src.gen3plus.solarman_v5":
from app.src.messages import hex_dump_memory, Message
from app.src.messages import hex_dump_memory, Message, State
from app.src.modbus import Modbus
from app.src.config import Config
from app.src.gen3plus.infos_g3p import InfosG3P
from app.src.infos import Register
else: # pragma: no cover
from messages import hex_dump_memory, Message
from messages import hex_dump_memory, Message, State
from config import Config
from modbus import Modbus
from gen3plus.infos_g3p import InfosG3P
@@ -135,7 +135,7 @@ class SolarmanV5(Message):
# deallocated by the garbage collector ==> we get a memory leak
self.switch.clear()
self.log_lvl.clear()
self.state = self.STATE_CLOSED
self.state = State.closed
super().close()
def __set_serial_no(self, snr: int):
@@ -184,6 +184,9 @@ class SolarmanV5(Message):
self._recv_buffer, self.header_len+self.data_len+2)
if self.__trailer_is_ok(self._recv_buffer, self.header_len
+ self.data_len + 2):
if self.state == State.init:
self.state = State.received
self.__set_serial_no(self.snr)
self.__dispatch_msg()
self.__flush_recv_msg()
@@ -345,7 +348,7 @@ class SolarmanV5(Message):
self.__finish_send_msg()
def send_modbus_cb(self, pdu: bytearray, log_lvl: int, state: str):
if self.state != self.STATE_UP:
if self.state != State.up:
logger.warning(f'[{self.node_id}] ignore MODBUS cmd,'
' cause the state is not UP anymore')
return
@@ -360,7 +363,7 @@ class SolarmanV5(Message):
self._send_buffer = bytearray(0) # self._send_buffer[sent:]
async def send_modbus_cmd(self, func, addr, val, log_lvl) -> None:
if self.state != self.STATE_UP:
if self.state != State.up:
logger.log(log_lvl, f'[{self.node_id}] ignore MODBUS cmd,'
' as the state is not UP')
return
@@ -371,7 +374,7 @@ class SolarmanV5(Message):
cmd.startswith(tuple(self.at_acl[connection]['block']))
async def send_at_cmd(self, AT_cmd: str) -> None:
if self.state != self.STATE_UP:
if self.state != State.up:
logger.warning(f'[{self.node_id}] ignore AT+ cmd,'
' as the state is not UP')
return
@@ -471,7 +474,7 @@ class SolarmanV5(Message):
self.__process_data(ftype)
self.__forward_msg()
self.__send_ack_rsp(0x1210, ftype)
self.state = self.STATE_UP
self.state = State.up
def msg_sync_start(self):
data = self._recv_buffer[self.header_len:]
@@ -567,7 +570,7 @@ class SolarmanV5(Message):
self.__forward_msg()
self.__send_ack_rsp(0x1710, ftype)
self.state = self.STATE_UP
self.state = State.up
def msg_sync_end(self):
data = self._recv_buffer[self.header_len:]

View File

@@ -1,6 +1,7 @@
import logging
import weakref
from typing import Callable, Generator
from enum import Enum
if __name__ == "app.src.messages":
@@ -52,11 +53,20 @@ class IterRegistry(type):
yield obj
class State(Enum):
'''state of the logical connection'''
init = 0
'''just created'''
received = 1
'''at least one packet received'''
up = 2
'''at least one cmd-rsp transaction'''
closed = 3
'''connection closed'''
class Message(metaclass=IterRegistry):
_registry = []
STATE_INIT = 0
STATE_UP = 2
STATE_CLOSED = 3
def __init__(self, server_side: bool, send_modbus_cb:
Callable[[bytes, int, str], None], mb_timeout: int):
@@ -78,7 +88,7 @@ class Message(metaclass=IterRegistry):
self._send_buffer = bytearray(0)
self._forward_buffer = bytearray(0)
self.new_data = {}
self.state = self.STATE_INIT
self.state = State.init
'''
Empty methods, that have to be implemented in any child class which

View File

@@ -3,6 +3,7 @@ import asyncio
import signal
import os
from asyncio import StreamReader, StreamWriter
from aiohttp import web
from logging import config # noqa F401
from messages import Message
from inverter import Inverter
@@ -11,6 +12,51 @@ from gen3plus.inverter_g3p import InverterG3P
from scheduler import Schedule
from config import Config
routes = web.RouteTableDef()
proxy_is_up = False
@routes.get('/')
async def hello(request):
return web.Response(text="Hello, world")
@routes.get('/-/ready')
async def ready(request):
if proxy_is_up:
status = 200
text = 'Is ready'
else:
status = 503
text = 'Not ready'
return web.Response(status=status, text=text)
@routes.get('/-/healthy')
async def healthy(request):
if proxy_is_up:
# logging.info('web reqeust healthy()')
for stream in Message:
try:
res = stream.healthy()
if not res:
return web.Response(status=503, text="I have a problem")
except Exception as err:
logging.info(f'Exception:{err}')
return web.Response(status=200, text="I'm fine")
async def webserver(runner, addr, port):
await runner.setup()
site = web.TCPSite(runner, addr, port)
await site.start()
logging.info(f'HTTP server listen on port: {port}')
while True:
await asyncio.sleep(3600) # sleep forever
async def handle_client(reader: StreamReader, writer: StreamWriter):
'''Handles a new incoming connection and starts an async loop'''
@@ -26,10 +72,12 @@ async def handle_client_v2(reader: StreamReader, writer: StreamWriter):
await InverterG3P(reader, writer, addr).server_loop(addr)
async def handle_shutdown(loop):
async def handle_shutdown(loop, runner):
'''Close all TCP connections and stop the event loop'''
logging.info('Shutdown due to SIGTERM')
await runner.cleanup()
logging.info('HTTP server stopped')
#
# first, disc all open TCP connections gracefully
@@ -87,15 +135,23 @@ if __name__ == "__main__":
logging.getLogger('tracer').setLevel(log_level)
# logging.getLogger('mqtt').setLevel(log_level)
# read config file
Config.class_init()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# read config file
ConfigErr = Config.class_init()
if ConfigErr is not None:
logging.info(f'ConfigErr: {ConfigErr}')
Inverter.class_init()
Schedule.start()
#
# Setup webserver application and runner
#
app = web.Application()
app.add_routes(routes)
runner = web.AppRunner(app)
#
# Register some UNIX Signal handler for a gracefully server shutdown
# on Docker restart and stop
@@ -103,7 +159,7 @@ if __name__ == "__main__":
for signame in ('SIGINT', 'SIGTERM'):
loop.add_signal_handler(getattr(signal, signame),
lambda loop=loop: asyncio.create_task(
handle_shutdown(loop)))
handle_shutdown(loop, runner)))
#
# Create tasks for our listening servers. These must be tasks! If we call
@@ -112,12 +168,16 @@ if __name__ == "__main__":
#
loop.create_task(asyncio.start_server(handle_client, '0.0.0.0', 5005))
loop.create_task(asyncio.start_server(handle_client_v2, '0.0.0.0', 10000))
loop.create_task(webserver(runner, '0.0.0.0', 8127))
try:
if ConfigErr is None:
proxy_is_up = True
loop.run_forever()
except KeyboardInterrupt:
pass
finally:
proxy_is_up = False
Inverter.class_close(loop)
logging.info('Close event loop')
loop.close()

View File

@@ -1682,21 +1682,21 @@ def test_zombie_conn(ConfigTsunInv1, MsgInverterInd):
m1 = MemoryStream(MsgInverterInd, (0,))
m2 = MemoryStream(MsgInverterInd, (0,))
m3 = MemoryStream(MsgInverterInd, (0,))
assert m1.state == m1.STATE_INIT
assert m2.state == m2.STATE_INIT
assert m3.state == m3.STATE_INIT
assert m1.state == m1.State.init
assert m2.state == m2.State.init
assert m3.state == m3.State.init
m1.read() # read complete msg, and set unique_id
assert m1.state == m1.STATE_INIT
assert m2.state == m2.STATE_INIT
assert m3.state == m3.STATE_INIT
assert m1.state == m1.State.init
assert m2.state == m2.State.init
assert m3.state == m3.State.init
m2.read() # read complete msg, and set unique_id
assert m1.state == m1.STATE_CLOSED
assert m2.state == m2.STATE_INIT
assert m3.state == m3.STATE_INIT
assert m1.state == m1.State.closed
assert m2.state == m2.State.init
assert m3.state == m3.State.init
m3.read() # read complete msg, and set unique_id
assert m1.state == m1.STATE_CLOSED
assert m2.state == m2.STATE_CLOSED
assert m3.state == m3.STATE_INIT
assert m1.state == m1.State.closed
assert m2.state == m2.State.closed
assert m3.state == m3.State.init
m1.close()
m2.close()
m3.close()

View File

@@ -4,6 +4,7 @@ from app.src.gen3.talent import Talent, Control
from app.src.config import Config
from app.src.infos import Infos, Register
from app.src.modbus import Modbus
from app.src.messages import State
pytest_plugins = ('pytest_asyncio',)
@@ -909,7 +910,7 @@ def test_msg_modbus_req(ConfigTsunInv1, MsgModbusCmd):
ConfigTsunInv1
m = MemoryStream(b'')
m.id_str = b"R170000000000001"
m.state = m.STATE_UP
m.state = State.up
c = m.createClientStream(MsgModbusCmd)
@@ -1159,7 +1160,7 @@ async def test_msg_build_modbus_req(ConfigTsunInv1, MsgModbusCmd):
assert m._send_buffer == b''
assert m.writer.sent_pdu == b''
m.state = m.STATE_UP
m.state = State.up
await m.send_modbus_cmd(Modbus.WRITE_SINGLE_REG, 0x2008, 0, logging.DEBUG)
assert 0 == m.send_msg_ofs
assert m._forward_buffer == b''
@@ -1189,21 +1190,21 @@ def test_zombie_conn(ConfigTsunInv1, MsgInverterInd):
m3 = MemoryStream(MsgInverterInd, (0,))
assert MemoryStream._RefNo == 3 + start_val
assert m3.RefNo == 3 + start_val
assert m1.state == m1.STATE_INIT
assert m2.state == m2.STATE_INIT
assert m3.state == m3.STATE_INIT
assert m1.state == m1.State.init
assert m2.state == m2.State.init
assert m3.state == m3.State.init
m1.read() # read complete msg, and set unique_id
assert m1.state == m1.STATE_UP
assert m2.state == m2.STATE_INIT
assert m3.state == m3.STATE_INIT
assert m1.state == m1.State.up
assert m2.state == m2.State.init
assert m3.state == m3.State.init
m2.read() # read complete msg, and set unique_id
assert m1.state == m1.STATE_CLOSED
assert m2.state == m2.STATE_UP
assert m3.state == m3.STATE_INIT
assert m1.state == m1.State.closed
assert m2.state == m2.State.up
assert m3.state == m3.State.init
m3.read() # read complete msg, and set unique_id
assert m1.state == m1.STATE_CLOSED
assert m2.state == m2.STATE_CLOSED
assert m3.state == m3.STATE_UP
assert m1.state == m1.State.closed
assert m2.state == m2.State.closed
assert m3.state == m3.State.up
m1.close()
m2.close()
m3.close()

View File

@@ -77,10 +77,15 @@ services:
- ${DNS2:-4.4.4.4}
ports:
- 5005:5005
- 8127:8127
- 10000:10000
volumes:
- ${PROJECT_DIR:-./}tsun-proxy/log:/home/tsun-proxy/log
- ${PROJECT_DIR:-./}tsun-proxy/config:/home/tsun-proxy/config
healthcheck:
test: wget --no-verbose --tries=1 --spider http://localhost:8127/-/healthy || exit 1
interval: 10s
timeout: 3s
networks:
- outside