diff --git a/CHANGELOG.md b/CHANGELOG.md index 12fec4b..2f9ef4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [unreleased] +- Add healthcheck, readiness and liveness checks [#91](https://github.com/s-allius/tsun-gen3-proxy/issues/91) - MODBUS close handler releases internal resource [#93](https://github.com/s-allius/tsun-gen3-proxy/issues/93) - add exception handling for message forwarding [#94](https://github.com/s-allius/tsun-gen3-proxy/issues/94) - GEN3: make timestamp handling stateless, to avoid blocking when the TSUN cloud is down [#56](https://github.com/s-allius/tsun-gen3-proxy/issues/56) @@ -29,7 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - catch all OSError errors in the read loop - log Modbus traces with different log levels - add Modbus fifo and timeout handler -- build version string in the same format as TSUN for GEN3 invterts +- build version string in the same format as TSUN for GEN3 inverters - add graceful shutdown - parse Modbus values and store them in the database - add cron task to request the output power every minute diff --git a/app/Dockerfile b/app/Dockerfile index 0ef685e..8b2d706 100644 --- a/app/Dockerfile +++ b/app/Dockerfile @@ -64,7 +64,7 @@ COPY --chmod=0700 entrypoint.sh /root/entrypoint.sh COPY config . COPY src . RUN date > /build-date.txt -EXPOSE 5005 +EXPOSE 5005 8127 10000 # command to run on container start ENTRYPOINT ["/root/entrypoint.sh"] diff --git a/app/hardening_final.sh b/app/hardening_final.sh index c6896bb..279e1b6 100644 --- a/app/hardening_final.sh +++ b/app/hardening_final.sh @@ -17,6 +17,5 @@ if [ "$environment" = "production" ] ; then \ -name od -o \ -name strings -o \ -name su -o \ - -name wget -o \ \) -delete \ ; fi diff --git a/app/requirements.txt b/app/requirements.txt index b151101..ed9dcb0 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -1,3 +1,4 @@ aiomqtt==2.0.1 schema==0.7.5 - aiocron==1.8 \ No newline at end of file + aiocron==1.8 + aiohttp==3.9.5 \ No newline at end of file diff --git a/app/src/async_stream.py b/app/src/async_stream.py index 3deeb43..1b55ade 100644 --- a/app/src/async_stream.py +++ b/app/src/async_stream.py @@ -1,9 +1,11 @@ import logging import traceback +import time from asyncio import StreamReader, StreamWriter -from messages import hex_dump_memory +from messages import hex_dump_memory, State from typing import Self +import gc logger = logging.getLogger('conn') @@ -17,6 +19,8 @@ class AsyncStream(): self.addr = addr self.r_addr = '' self.l_addr = '' + self.proc_start = None # start processing start timestamp + self.proc_max = 0 async def server_loop(self, addr: str) -> None: '''Loop for receiving messages from the inverter (server-side)''' @@ -61,8 +65,14 @@ class AsyncStream(): """Async loop handler for precessing all received messages""" self.r_addr = self.writer.get_extra_info('peername') self.l_addr = self.writer.get_extra_info('sockname') + self.proc_start = time.time() while True: try: + proc = time.time() - self.proc_start + if proc > self.proc_max: + self.proc_max = proc + self.proc_start = None + await self.__async_read() if self.unique_id: @@ -117,6 +127,17 @@ class AsyncStream(): logger.debug(f'AsyncStream.close() l{self.l_addr} | r{self.r_addr}') self.writer.close() + def healthy(self) -> bool: + elapsed = 0 + if self.proc_start is not None: + elapsed = time.time() - self.proc_start + if self.state == State.closed or elapsed > 1: + logging.debug(f'[{self.node_id}:{type(self).__name__}]' + f' act:{round(1000*elapsed)}ms' + f' max:{round(1000*self.proc_max)}ms') + logging.debug(f'Healthy()) refs: {gc.get_referrers(self)}') + return elapsed < 5 + ''' Our private methods ''' @@ -124,6 +145,7 @@ class AsyncStream(): """Async read handler to read received data from TCP stream""" data = await self.reader.read(4096) if data: + self.proc_start = time.time() self._recv_buffer += data self.read() # call read in parent class else: diff --git a/app/src/gen3/connection_g3.py b/app/src/gen3/connection_g3.py index 7730069..7e7b96d 100644 --- a/app/src/gen3/connection_g3.py +++ b/app/src/gen3/connection_g3.py @@ -31,6 +31,10 @@ class ConnectionG3(AsyncStream, Talent): async def async_publ_mqtt(self) -> None: pass + def healthy(self) -> bool: + logger.debug('ConnectionG3 healthy()') + return AsyncStream.healthy(self) + ''' Our private methods ''' diff --git a/app/src/gen3/talent.py b/app/src/gen3/talent.py index 57ac794..532e56b 100644 --- a/app/src/gen3/talent.py +++ b/app/src/gen3/talent.py @@ -4,12 +4,12 @@ import time from datetime import datetime if __name__ == "app.src.gen3.talent": - from app.src.messages import hex_dump_memory, Message + from app.src.messages import hex_dump_memory, Message, State from app.src.modbus import Modbus from app.src.config import Config from app.src.gen3.infos_g3 import InfosG3 else: # pragma: no cover - from messages import hex_dump_memory, Message + from messages import hex_dump_memory, Message, State from modbus import Modbus from config import Config from gen3.infos_g3 import InfosG3 @@ -77,7 +77,7 @@ class Talent(Message): # deallocated by the garbage collector ==> we get a memory leak self.switch.clear() self.log_lvl.clear() - self.state = self.STATE_CLOSED + self.state = State.closed super().close() def __set_serial_no(self, serial_no: str): @@ -114,6 +114,9 @@ class Talent(Message): if self.header_valid and len(self._recv_buffer) >= (self.header_len + self.data_len): + if self.state == State.init: + self.state = State.received + log_lvl = self.log_lvl.get(self.msg_id, logging.WARNING) if callable(log_lvl): log_lvl = log_lvl() @@ -141,7 +144,7 @@ class Talent(Message): return def send_modbus_cb(self, modbus_pdu: bytearray, log_lvl: int, state: str): - if self.state != self.STATE_UP: + if self.state != State.up: logger.warning(f'[{self.node_id}] ignore MODBUS cmd,' ' cause the state is not UP anymore') return @@ -158,7 +161,7 @@ class Talent(Message): self._send_buffer = bytearray(0) # self._send_buffer[sent:] async def send_modbus_cmd(self, func, addr, val, log_lvl) -> None: - if self.state != self.STATE_UP: + if self.state != State.up: logger.log(log_lvl, f'[{self.node_id}] ignore MODBUS cmd,' ' as the state is not UP') return @@ -371,7 +374,7 @@ class Talent(Message): self._send_buffer += b'\x01' self.__finish_send_msg() self.__process_data() - self.state = self.STATE_UP + self.state = State.up elif self.ctrl.is_resp(): return # ignore received response @@ -387,7 +390,7 @@ class Talent(Message): self._send_buffer += b'\x01' self.__finish_send_msg() self.__process_data() - self.state = self.STATE_UP + self.state = State.up elif self.ctrl.is_resp(): return # ignore received response diff --git a/app/src/gen3plus/connection_g3p.py b/app/src/gen3plus/connection_g3p.py index ecc5625..352ba5e 100644 --- a/app/src/gen3plus/connection_g3p.py +++ b/app/src/gen3plus/connection_g3p.py @@ -31,6 +31,10 @@ class ConnectionG3P(AsyncStream, SolarmanV5): async def async_publ_mqtt(self) -> None: pass + def healthy(self) -> bool: + logger.debug('ConnectionG3P healthy()') + return AsyncStream.healthy(self) + ''' Our private methods ''' diff --git a/app/src/gen3plus/solarman_v5.py b/app/src/gen3plus/solarman_v5.py index dd209b0..d65e560 100644 --- a/app/src/gen3plus/solarman_v5.py +++ b/app/src/gen3plus/solarman_v5.py @@ -6,13 +6,13 @@ import asyncio from datetime import datetime if __name__ == "app.src.gen3plus.solarman_v5": - from app.src.messages import hex_dump_memory, Message + from app.src.messages import hex_dump_memory, Message, State from app.src.modbus import Modbus from app.src.config import Config from app.src.gen3plus.infos_g3p import InfosG3P from app.src.infos import Register else: # pragma: no cover - from messages import hex_dump_memory, Message + from messages import hex_dump_memory, Message, State from config import Config from modbus import Modbus from gen3plus.infos_g3p import InfosG3P @@ -135,7 +135,7 @@ class SolarmanV5(Message): # deallocated by the garbage collector ==> we get a memory leak self.switch.clear() self.log_lvl.clear() - self.state = self.STATE_CLOSED + self.state = State.closed super().close() def __set_serial_no(self, snr: int): @@ -184,6 +184,9 @@ class SolarmanV5(Message): self._recv_buffer, self.header_len+self.data_len+2) if self.__trailer_is_ok(self._recv_buffer, self.header_len + self.data_len + 2): + if self.state == State.init: + self.state = State.received + self.__set_serial_no(self.snr) self.__dispatch_msg() self.__flush_recv_msg() @@ -345,7 +348,7 @@ class SolarmanV5(Message): self.__finish_send_msg() def send_modbus_cb(self, pdu: bytearray, log_lvl: int, state: str): - if self.state != self.STATE_UP: + if self.state != State.up: logger.warning(f'[{self.node_id}] ignore MODBUS cmd,' ' cause the state is not UP anymore') return @@ -360,7 +363,7 @@ class SolarmanV5(Message): self._send_buffer = bytearray(0) # self._send_buffer[sent:] async def send_modbus_cmd(self, func, addr, val, log_lvl) -> None: - if self.state != self.STATE_UP: + if self.state != State.up: logger.log(log_lvl, f'[{self.node_id}] ignore MODBUS cmd,' ' as the state is not UP') return @@ -371,7 +374,7 @@ class SolarmanV5(Message): cmd.startswith(tuple(self.at_acl[connection]['block'])) async def send_at_cmd(self, AT_cmd: str) -> None: - if self.state != self.STATE_UP: + if self.state != State.up: logger.warning(f'[{self.node_id}] ignore AT+ cmd,' ' as the state is not UP') return @@ -471,7 +474,7 @@ class SolarmanV5(Message): self.__process_data(ftype) self.__forward_msg() self.__send_ack_rsp(0x1210, ftype) - self.state = self.STATE_UP + self.state = State.up def msg_sync_start(self): data = self._recv_buffer[self.header_len:] @@ -567,7 +570,7 @@ class SolarmanV5(Message): self.__forward_msg() self.__send_ack_rsp(0x1710, ftype) - self.state = self.STATE_UP + self.state = State.up def msg_sync_end(self): data = self._recv_buffer[self.header_len:] diff --git a/app/src/messages.py b/app/src/messages.py index 6553f36..35e31db 100644 --- a/app/src/messages.py +++ b/app/src/messages.py @@ -1,6 +1,7 @@ import logging import weakref from typing import Callable, Generator +from enum import Enum if __name__ == "app.src.messages": @@ -52,11 +53,20 @@ class IterRegistry(type): yield obj +class State(Enum): + '''state of the logical connection''' + init = 0 + '''just created''' + received = 1 + '''at least one packet received''' + up = 2 + '''at least one cmd-rsp transaction''' + closed = 3 + '''connection closed''' + + class Message(metaclass=IterRegistry): _registry = [] - STATE_INIT = 0 - STATE_UP = 2 - STATE_CLOSED = 3 def __init__(self, server_side: bool, send_modbus_cb: Callable[[bytes, int, str], None], mb_timeout: int): @@ -78,7 +88,7 @@ class Message(metaclass=IterRegistry): self._send_buffer = bytearray(0) self._forward_buffer = bytearray(0) self.new_data = {} - self.state = self.STATE_INIT + self.state = State.init ''' Empty methods, that have to be implemented in any child class which diff --git a/app/src/server.py b/app/src/server.py index e941975..183ada0 100644 --- a/app/src/server.py +++ b/app/src/server.py @@ -3,6 +3,7 @@ import asyncio import signal import os from asyncio import StreamReader, StreamWriter +from aiohttp import web from logging import config # noqa F401 from messages import Message from inverter import Inverter @@ -11,6 +12,51 @@ from gen3plus.inverter_g3p import InverterG3P from scheduler import Schedule from config import Config +routes = web.RouteTableDef() +proxy_is_up = False + + +@routes.get('/') +async def hello(request): + return web.Response(text="Hello, world") + + +@routes.get('/-/ready') +async def ready(request): + if proxy_is_up: + status = 200 + text = 'Is ready' + else: + status = 503 + text = 'Not ready' + return web.Response(status=status, text=text) + + +@routes.get('/-/healthy') +async def healthy(request): + + if proxy_is_up: + # logging.info('web reqeust healthy()') + for stream in Message: + try: + res = stream.healthy() + if not res: + return web.Response(status=503, text="I have a problem") + except Exception as err: + logging.info(f'Exception:{err}') + + return web.Response(status=200, text="I'm fine") + + +async def webserver(runner, addr, port): + await runner.setup() + site = web.TCPSite(runner, addr, port) + await site.start() + logging.info(f'HTTP server listen on port: {port}') + + while True: + await asyncio.sleep(3600) # sleep forever + async def handle_client(reader: StreamReader, writer: StreamWriter): '''Handles a new incoming connection and starts an async loop''' @@ -26,10 +72,12 @@ async def handle_client_v2(reader: StreamReader, writer: StreamWriter): await InverterG3P(reader, writer, addr).server_loop(addr) -async def handle_shutdown(loop): +async def handle_shutdown(loop, runner): '''Close all TCP connections and stop the event loop''' logging.info('Shutdown due to SIGTERM') + await runner.cleanup() + logging.info('HTTP server stopped') # # first, disc all open TCP connections gracefully @@ -87,15 +135,23 @@ if __name__ == "__main__": logging.getLogger('tracer').setLevel(log_level) # logging.getLogger('mqtt').setLevel(log_level) - # read config file - Config.class_init() - loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) + # read config file + ConfigErr = Config.class_init() + if ConfigErr is not None: + logging.info(f'ConfigErr: {ConfigErr}') Inverter.class_init() Schedule.start() + # + # Setup webserver application and runner + # + app = web.Application() + app.add_routes(routes) + runner = web.AppRunner(app) + # # Register some UNIX Signal handler for a gracefully server shutdown # on Docker restart and stop @@ -103,7 +159,7 @@ if __name__ == "__main__": for signame in ('SIGINT', 'SIGTERM'): loop.add_signal_handler(getattr(signal, signame), lambda loop=loop: asyncio.create_task( - handle_shutdown(loop))) + handle_shutdown(loop, runner))) # # Create tasks for our listening servers. These must be tasks! If we call @@ -112,12 +168,16 @@ if __name__ == "__main__": # loop.create_task(asyncio.start_server(handle_client, '0.0.0.0', 5005)) loop.create_task(asyncio.start_server(handle_client_v2, '0.0.0.0', 10000)) + loop.create_task(webserver(runner, '0.0.0.0', 8127)) try: + if ConfigErr is None: + proxy_is_up = True loop.run_forever() except KeyboardInterrupt: pass finally: + proxy_is_up = False Inverter.class_close(loop) logging.info('Close event loop') loop.close() diff --git a/app/tests/test_solarman.py b/app/tests/test_solarman.py index ef29998..dbbaa8d 100644 --- a/app/tests/test_solarman.py +++ b/app/tests/test_solarman.py @@ -1682,21 +1682,21 @@ def test_zombie_conn(ConfigTsunInv1, MsgInverterInd): m1 = MemoryStream(MsgInverterInd, (0,)) m2 = MemoryStream(MsgInverterInd, (0,)) m3 = MemoryStream(MsgInverterInd, (0,)) - assert m1.state == m1.STATE_INIT - assert m2.state == m2.STATE_INIT - assert m3.state == m3.STATE_INIT + assert m1.state == m1.State.init + assert m2.state == m2.State.init + assert m3.state == m3.State.init m1.read() # read complete msg, and set unique_id - assert m1.state == m1.STATE_INIT - assert m2.state == m2.STATE_INIT - assert m3.state == m3.STATE_INIT + assert m1.state == m1.State.init + assert m2.state == m2.State.init + assert m3.state == m3.State.init m2.read() # read complete msg, and set unique_id - assert m1.state == m1.STATE_CLOSED - assert m2.state == m2.STATE_INIT - assert m3.state == m3.STATE_INIT + assert m1.state == m1.State.closed + assert m2.state == m2.State.init + assert m3.state == m3.State.init m3.read() # read complete msg, and set unique_id - assert m1.state == m1.STATE_CLOSED - assert m2.state == m2.STATE_CLOSED - assert m3.state == m3.STATE_INIT + assert m1.state == m1.State.closed + assert m2.state == m2.State.closed + assert m3.state == m3.State.init m1.close() m2.close() m3.close() diff --git a/app/tests/test_talent.py b/app/tests/test_talent.py index 7632a09..6256811 100644 --- a/app/tests/test_talent.py +++ b/app/tests/test_talent.py @@ -4,6 +4,7 @@ from app.src.gen3.talent import Talent, Control from app.src.config import Config from app.src.infos import Infos, Register from app.src.modbus import Modbus +from app.src.messages import State pytest_plugins = ('pytest_asyncio',) @@ -909,7 +910,7 @@ def test_msg_modbus_req(ConfigTsunInv1, MsgModbusCmd): ConfigTsunInv1 m = MemoryStream(b'') m.id_str = b"R170000000000001" - m.state = m.STATE_UP + m.state = State.up c = m.createClientStream(MsgModbusCmd) @@ -1159,7 +1160,7 @@ async def test_msg_build_modbus_req(ConfigTsunInv1, MsgModbusCmd): assert m._send_buffer == b'' assert m.writer.sent_pdu == b'' - m.state = m.STATE_UP + m.state = State.up await m.send_modbus_cmd(Modbus.WRITE_SINGLE_REG, 0x2008, 0, logging.DEBUG) assert 0 == m.send_msg_ofs assert m._forward_buffer == b'' @@ -1189,21 +1190,21 @@ def test_zombie_conn(ConfigTsunInv1, MsgInverterInd): m3 = MemoryStream(MsgInverterInd, (0,)) assert MemoryStream._RefNo == 3 + start_val assert m3.RefNo == 3 + start_val - assert m1.state == m1.STATE_INIT - assert m2.state == m2.STATE_INIT - assert m3.state == m3.STATE_INIT + assert m1.state == m1.State.init + assert m2.state == m2.State.init + assert m3.state == m3.State.init m1.read() # read complete msg, and set unique_id - assert m1.state == m1.STATE_UP - assert m2.state == m2.STATE_INIT - assert m3.state == m3.STATE_INIT + assert m1.state == m1.State.up + assert m2.state == m2.State.init + assert m3.state == m3.State.init m2.read() # read complete msg, and set unique_id - assert m1.state == m1.STATE_CLOSED - assert m2.state == m2.STATE_UP - assert m3.state == m3.STATE_INIT + assert m1.state == m1.State.closed + assert m2.state == m2.State.up + assert m3.state == m3.State.init m3.read() # read complete msg, and set unique_id - assert m1.state == m1.STATE_CLOSED - assert m2.state == m2.STATE_CLOSED - assert m3.state == m3.STATE_UP + assert m1.state == m1.State.closed + assert m2.state == m2.State.closed + assert m3.state == m3.State.up m1.close() m2.close() m3.close() diff --git a/docker-compose.yaml b/docker-compose.yaml index 4566a80..9b4f9e8 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -77,10 +77,15 @@ services: - ${DNS2:-4.4.4.4} ports: - 5005:5005 + - 8127:8127 - 10000:10000 volumes: - ${PROJECT_DIR:-./}tsun-proxy/log:/home/tsun-proxy/log - ${PROJECT_DIR:-./}tsun-proxy/config:/home/tsun-proxy/config + healthcheck: + test: wget --no-verbose --tries=1 --spider http://localhost:8127/-/healthy || exit 1 + interval: 10s + timeout: 3s networks: - outside