From 7b20e71d6571b84406c298b5186a443aa671ebb3 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Tue, 9 Apr 2024 19:26:44 -0600 Subject: [PATCH] add vision --- .gitignore | 6 +-- README.md | 2 + config.sample.yaml | 4 ++ main.py | 4 +- matrix_gpt/api_client_manager.py | 13 +++--- matrix_gpt/callbacks.py | 13 +++--- matrix_gpt/chat_functions.py | 19 ++++++++- matrix_gpt/config.py | 11 +++++- matrix_gpt/generate.py | 17 ++++++-- matrix_gpt/generate_clients/anthropic.py | 44 +++++++++++++++++++-- matrix_gpt/generate_clients/api_client.py | 12 +++++- matrix_gpt/generate_clients/command_info.py | 3 +- matrix_gpt/generate_clients/openai.py | 22 ++++++++++- matrix_gpt/handle_actions.py | 16 +++++--- matrix_gpt/image.py | 23 +++++++++++ matrix_gpt/matrix_helper.py | 6 +-- requirements.txt | 5 +-- 17 files changed, 178 insertions(+), 42 deletions(-) create mode 100644 matrix_gpt/image.py diff --git a/.gitignore b/.gitignore index 4596390..63324db 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ .idea -*-store/ -*.yaml -!config.sample.yaml +bot-store/ +config.yaml +config.yml # ---> Python # Byte-compiled / optimized / DLL files diff --git a/README.md b/README.md index fd78789..a41e4c9 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ _Chatbots for Matrix._ This bot supports OpenAI, Anthropic, and locally hosted models that use an OpenAI-compatible endpoint. It can run multiple different models using different triggers, such as `!c4` for GPT4 and `!ca` for Anthropic, all through the same bot. +OpenAI and Anthropic vision is supported. +
## Install diff --git a/config.sample.yaml b/config.sample.yaml index c342ae6..98b5021 100644 --- a/config.sample.yaml +++ b/config.sample.yaml @@ -77,6 +77,10 @@ command: # Custom OpenAI endpoint. OpenAI only. # api_base: https://example.com/openai/v1 + # Enable vision on this model. + # Bot can only view images that are in threads. Threads cannot be started with images. + # vision: false + # Bot's description, shown when running `!matrixgpt`. # help: A helpful assistant. diff --git a/main.py b/main.py index 59767d6..a8965c3 100644 --- a/main.py +++ b/main.py @@ -11,7 +11,7 @@ from pathlib import Path from aiohttp import ClientConnectionError, ServerDisconnectedError from bison.errors import SchemeValidationError -from nio import InviteMemberEvent, JoinResponse, MegolmEvent, RoomMessageText, UnknownEvent +from nio import InviteMemberEvent, JoinResponse, MegolmEvent, RoomMessageText, UnknownEvent, RoomMessageImage from matrix_gpt import MatrixClientHelper from matrix_gpt.callbacks import MatrixBotCallbacks @@ -79,7 +79,7 @@ async def main(args): # Set up event callbacks callbacks = MatrixBotCallbacks(client=client_helper) - client.add_event_callback(callbacks.handle_message, RoomMessageText) + client.add_event_callback(callbacks.handle_message, (RoomMessageText, RoomMessageImage)) client.add_event_callback(callbacks.handle_invite, InviteMemberEvent) client.add_event_callback(callbacks.decryption_failure, MegolmEvent) client.add_event_callback(callbacks.unknown, UnknownEvent) diff --git a/matrix_gpt/api_client_manager.py b/matrix_gpt/api_client_manager.py index 2cfb825..19b8164 100644 --- a/matrix_gpt/api_client_manager.py +++ b/matrix_gpt/api_client_manager.py @@ -1,5 +1,6 @@ import logging +from matrix_gpt import MatrixClientHelper from matrix_gpt.config import global_config from matrix_gpt.generate_clients.anthropic import AnthropicApiClient from matrix_gpt.generate_clients.openai import OpenAIClient @@ -23,30 +24,32 @@ class ApiClientManager: self._openai_api_key = global_config['openai'].get('api_key', 'MatrixGPT') self._anth_api_key = global_config['anthropic'].get('api_key') - def get_client(self, mode: str): + def get_client(self, mode: str, client_helper: MatrixClientHelper): if mode == 'openai': - return self.openai_client() + return self.openai_client(client_helper) elif mode == 'anth': - return self.anth_client() + return self.anth_client(client_helper) else: raise Exception - def openai_client(self): + def openai_client(self, client_helper: MatrixClientHelper): self._set_from_config() if not self._openai_api_key: self.logger.error('Missing an OpenAI API key!') return None return OpenAIClient( api_key=self._openai_api_key, + client_helper=client_helper ) - def anth_client(self): + def anth_client(self, client_helper: MatrixClientHelper): self._set_from_config() if not self._anth_api_key: self.logger.error('Missing an Anthropic API key!') return None return AnthropicApiClient( api_key=self._anth_api_key, + client_helper=client_helper ) diff --git a/matrix_gpt/callbacks.py b/matrix_gpt/callbacks.py index d7039ee..de1f451 100644 --- a/matrix_gpt/callbacks.py +++ b/matrix_gpt/callbacks.py @@ -1,8 +1,9 @@ import asyncio import logging import time +from typing import Union -from nio import (AsyncClient, InviteMemberEvent, MatrixRoom, MegolmEvent, RoomMessageText, UnknownEvent) +from nio import (AsyncClient, InviteMemberEvent, MatrixRoom, MegolmEvent, RoomMessageText, UnknownEvent, RoomMessageImage) from .chat_functions import check_authorized, is_thread, check_command_prefix from .config import global_config @@ -18,12 +19,11 @@ class MatrixBotCallbacks: self.startup_ts = time.time() * 1000 self.seen_messages = set() - async def handle_message(self, room: MatrixRoom, requestor_event: RoomMessageText) -> None: + async def handle_message(self, room: MatrixRoom, requestor_event: Union[RoomMessageText, RoomMessageImage]) -> None: """ Callback for when a message event is received. """ - # Mark all messages as read. - mark_read_task = asyncio.create_task(self.client.room_read_markers(room.room_id, requestor_event.event_id, requestor_event.event_id)) + mark_read_task = asyncio.create_task(self.client.room_read_markers(room.room_id, requestor_event.event_id, requestor_event.event_id)) # Mark all messages as read. msg = requestor_event.body.strip().strip('\n') if msg == "** Unable to decrypt: The sender's device has not sent us the keys for this message. **": self.logger.debug(f'Unable to decrypt event "{requestor_event.event_id} in room {room.room_id}') @@ -37,6 +37,7 @@ class MatrixBotCallbacks: await sound_off(room, requestor_event, self.client_helper) return if requestor_event.event_id in self.seen_messages: + # Need to track messages manually because the sync background thread may trigger the callback. return self.seen_messages.add(requestor_event.event_id) command_activated, sent_command_prefix, command_info = check_command_prefix(msg) @@ -46,8 +47,8 @@ class MatrixBotCallbacks: self.logger.debug(f'Message from {requestor_event.sender} in {room.room_id} --> "{msg}"') # Start the task in the background and don't wait for it here or else we'll block everything. task = asyncio.create_task(do_reply_threaded_msg(self.client_helper, room, requestor_event)) - elif command_activated and not is_thread(requestor_event): - # Everything else + elif isinstance(requestor_event, RoomMessageText) and command_activated and not is_thread(requestor_event): + # Everything else. Images do not start threads. self.logger.debug(f'Message from {requestor_event.sender} in {room.room_id} --> "{msg}"') allowed_to_chat = command_info.allowed_to_chat + global_config['allowed_to_chat'] if not check_authorized(requestor_event.sender, allowed_to_chat): diff --git a/matrix_gpt/chat_functions.py b/matrix_gpt/chat_functions.py index e2c0ace..9c0eeb0 100644 --- a/matrix_gpt/chat_functions.py +++ b/matrix_gpt/chat_functions.py @@ -1,5 +1,6 @@ import logging from typing import List, Tuple +from urllib.parse import urlparse from nio import AsyncClient, Event, MatrixRoom, RoomGetEventResponse, RoomMessageText @@ -36,19 +37,26 @@ async def is_this_our_thread(client: AsyncClient, room: MatrixRoom, event: RoomM async def get_thread_content(client: AsyncClient, room: MatrixRoom, base_event: RoomMessageText) -> List[Event]: messages = [] + + # This is the event of the message that was just sent. new_event = (await client.room_get_event(room.room_id, base_event.event_id)).event + while True: if new_event.source['content'].get('m.relates_to', {}).get('rel_type') == 'm.thread': + # Put the event in the messages list only if it's related to the thread we're parsing. messages.append(new_event) else: break + # Fetch the next event. new_event = (await client.room_get_event( room.room_id, new_event.source['content']['m.relates_to']['m.in_reply_to']['event_id']) ).event + + # Put the root event in the array. messages.append((await client.room_get_event( room.room_id, base_event.source['content']['m.relates_to']['event_id']) - ).event) # put the root event in the array + ).event) messages.reverse() return messages @@ -77,3 +85,12 @@ def check_authorized(string, to_check): return output else: raise Exception + + +async def download_mxc(url: str, client: AsyncClient) -> bytes: + mxc = urlparse(url) + response = await client.download(mxc.netloc, mxc.path.strip("/")) + if hasattr(response, "body"): + return response.body + else: + return b'' diff --git a/matrix_gpt/config.py b/matrix_gpt/config.py index 8e5f585..dc8b52e 100644 --- a/matrix_gpt/config.py +++ b/matrix_gpt/config.py @@ -31,6 +31,7 @@ config_scheme = bison.Scheme( bison.Option('system_prompt', field_type=str, default=None), bison.Option('injected_system_prompt', field_type=str, default=None), bison.Option('api_base', field_type=[str, NoneType], default=None), + bison.Option('vision', field_type=bool, default=False), bison.Option('help', field_type=[str, NoneType], default=None), )), bison.DictOption('openai', scheme=bison.Scheme( @@ -83,7 +84,15 @@ class ConfigManager: if not self._config.config['openai']['api_key'] and not self._config.config['anthropic']['api_key']: raise SchemeValidationError('You need an OpenAI or Anthropic API key') self._parsed_config = self._merge_in_list_defaults() - # TODO: make sure there aren't duplicate triggers + + # Make sure there aren't duplicate triggers + existing_triggers = [] + for item in self._config.config['command']: + trigger = item['trigger'] + if trigger in existing_triggers: + raise SchemeValidationError(f'Duplicate trigger {trigger}') + existing_triggers.append(trigger) + self._command_prefixes = self._generate_command_prefixes() def _merge_in_list_defaults(self): diff --git a/matrix_gpt/generate.py b/matrix_gpt/generate.py index e7f95ce..fa3289a 100644 --- a/matrix_gpt/generate.py +++ b/matrix_gpt/generate.py @@ -30,7 +30,7 @@ async def generate_ai_response( try: await client.room_typing(room.room_id, typing_state=True, timeout=global_config['response_timeout'] * 1000) - api_client = api_client_helper.get_client(command_info.api_type) + api_client = api_client_helper.get_client(command_info.api_type, client_helper) messages = api_client.assemble_context(msg, system_prompt=command_info.system_prompt, injected_system_prompt=command_info.injected_system_prompt) response = None @@ -77,9 +77,18 @@ async def generate_ai_response( # Logging if global_config['logging']['log_full_response']: - logger.debug( - {'event_id': event.event_id, 'room': room.room_id, 'messages': messages, 'response': response} - ) + data = {'event_id': event.event_id, 'room': room.room_id, 'messages': messages, 'response': response} + # Remove images from the logged data. + for i in range(len(data['messages'])): + if isinstance(data['messages'][i]['content'], list): + # Images are always sent as lists + if data['messages'][i]['content'][0].get('source', {}).get('media_type'): + # Anthropic + data['messages'][i]['content'][0]['source']['data'] = '...' + elif data['messages'][i]['content'][0].get('image_url'): + # OpenAI + data['messages'][i]['content'][0]['image_url']['url'] = '...' + logger.debug(data) z = text_response.replace("\n", "\\n") logger.info(f'Reply to {event.event_id} --> {command_info.model} responded with "{z}"') diff --git a/matrix_gpt/generate_clients/anthropic.py b/matrix_gpt/generate_clients/anthropic.py index e06d3c2..62adecc 100644 --- a/matrix_gpt/generate_clients/anthropic.py +++ b/matrix_gpt/generate_clients/anthropic.py @@ -1,14 +1,17 @@ from typing import Union from anthropic import AsyncAnthropic +from nio import RoomMessageImage +from matrix_gpt.chat_functions import download_mxc from matrix_gpt.generate_clients.api_client import ApiClient from matrix_gpt.generate_clients.command_info import CommandInfo +from matrix_gpt.image import process_image class AnthropicApiClient(ApiClient): - def __init__(self, api_key: str): - super().__init__(api_key) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def _create_client(self, base_url: str = None): return AsyncAnthropic( @@ -23,11 +26,46 @@ class AnthropicApiClient(ApiClient): self._context = messages return messages + def verify_context(self): + """ + Verify that the context alternates between the human and assistant, inserting the opposite + user type if it does not alternate correctly. + """ + i = 0 + while i < len(self._context) - 1: + if self._context[i]['role'] == self._context[i + 1]['role']: + dummy = self.generate_text_msg(f'<{self._BOT_NAME} did not respond>', self._BOT_NAME) if self._context[i]['role'] == self._HUMAN_NAME else self.generate_text_msg(f'<{self._HUMAN_NAME} did not respond>', self._HUMAN_NAME) + self._context.insert(i + 1, dummy) + i += 1 + # if self._context[-1]['role'] == self._HUMAN_NAME: + # self._context.append(self.generate_text_msg(f'<{self._BOT_NAME} did not respond>', self._BOT_NAME)) + + def generate_text_msg(self, content: str, role: str): + assert role in [self._HUMAN_NAME, self._BOT_NAME] + return {"role": role, "content": [{"type": "text", "text": str(content)}]} + def append_msg(self, content: str, role: str): assert role in [self._HUMAN_NAME, self._BOT_NAME] - self._context.append({"role": role, "content": [{"type": "text", "text": str(content)}]}) + self._context.append(self.generate_text_msg(content, role)) + + async def append_img(self, img_event: RoomMessageImage, role: str): + assert role in [self._HUMAN_NAME, self._BOT_NAME] + img_bytes = await download_mxc(img_event.url, self.client_helper.client) + encoded_image = process_image(img_bytes, resize_px=784) + self._context.append({ + "role": role, + 'content': [{ + 'type': 'image', + 'source': { + 'type': 'base64', + 'media_type': 'image/png', + 'data': encoded_image + } + }] + }) async def generate(self, command_info: CommandInfo): + self.verify_context() r = await self._create_client().messages.create( model=command_info.model, max_tokens=None if command_info.max_tokens == 0 else command_info.max_tokens, diff --git a/matrix_gpt/generate_clients/api_client.py b/matrix_gpt/generate_clients/api_client.py index 1798186..37d4f1c 100644 --- a/matrix_gpt/generate_clients/api_client.py +++ b/matrix_gpt/generate_clients/api_client.py @@ -1,5 +1,8 @@ from typing import Union +from nio import RoomMessageImage + +from matrix_gpt import MatrixClientHelper from matrix_gpt.generate_clients.command_info import CommandInfo @@ -7,8 +10,9 @@ class ApiClient: _HUMAN_NAME = 'user' _BOT_NAME = 'assistant' - def __init__(self, api_key: str): + def __init__(self, api_key: str, client_helper: MatrixClientHelper): self.api_key = api_key + self.client_helper = client_helper self._context = [] def _create_client(self, base_url: str = None): @@ -17,9 +21,15 @@ class ApiClient: def assemble_context(self, messages: Union[str, list], system_prompt: str = None, injected_system_prompt: str = None): raise NotImplementedError + def generate_text_msg(self, content: str, role: str): + raise NotImplementedError + def append_msg(self, content: str, role: str): raise NotImplementedError + async def append_img(self, img_event: RoomMessageImage, role: str): + raise NotImplementedError + async def generate(self, command_info: CommandInfo): raise NotImplementedError diff --git a/matrix_gpt/generate_clients/command_info.py b/matrix_gpt/generate_clients/command_info.py index d8b5212..6064120 100644 --- a/matrix_gpt/generate_clients/command_info.py +++ b/matrix_gpt/generate_clients/command_info.py @@ -2,7 +2,7 @@ from matrix_gpt.config import global_config class CommandInfo: - def __init__(self, trigger: str, api_type: str, model: str, max_tokens: int, temperature: float, allowed_to_chat: list, allowed_to_thread: list, allowed_to_invite: list, system_prompt: str, injected_system_prompt: str, api_base: str = None, help: str = None): + def __init__(self, trigger: str, api_type: str, model: str, max_tokens: int, temperature: float, allowed_to_chat: list, allowed_to_thread: list, allowed_to_invite: list, system_prompt: str, injected_system_prompt: str, api_base: str = None, vision: bool = False, help: str = None): self.trigger = trigger assert api_type in ['openai', 'anth'] self.api_type = api_type @@ -12,6 +12,7 @@ class CommandInfo: self.system_prompt = system_prompt self.injected_system_prompt = injected_system_prompt self.api_base = api_base + self.vision = vision self.help = help self.allowed_to_chat = allowed_to_chat diff --git a/matrix_gpt/generate_clients/openai.py b/matrix_gpt/generate_clients/openai.py index 140c3b9..d3c2047 100644 --- a/matrix_gpt/generate_clients/openai.py +++ b/matrix_gpt/generate_clients/openai.py @@ -1,15 +1,18 @@ from typing import Union +from nio import RoomMessageImage from openai import AsyncOpenAI +from matrix_gpt.chat_functions import download_mxc from matrix_gpt.config import global_config from matrix_gpt.generate_clients.api_client import ApiClient from matrix_gpt.generate_clients.command_info import CommandInfo +from matrix_gpt.image import process_image class OpenAIClient(ApiClient): - def __init__(self, api_key: str): - super().__init__(api_key) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def _create_client(self, api_base: str = None): return AsyncOpenAI( @@ -21,6 +24,21 @@ class OpenAIClient(ApiClient): assert role in [self._HUMAN_NAME, self._BOT_NAME] self._context.append({'role': role, 'content': content}) + async def append_img(self, img_event: RoomMessageImage, role: str): + assert role in [self._HUMAN_NAME, self._BOT_NAME] + img_bytes = await download_mxc(img_event.url, self.client_helper.client) + encoded_image = process_image(img_bytes, resize_px=512) + self._context.append({ + "role": role, + 'content': [{ + 'type': 'image_url', + 'image_url': { + 'url': f"data:image/png;base64,{encoded_image}", + 'detail': 'low' + } + }] + }) + def assemble_context(self, messages: Union[str, list], system_prompt: str = None, injected_system_prompt: str = None): if isinstance(messages, list): messages = messages diff --git a/matrix_gpt/handle_actions.py b/matrix_gpt/handle_actions.py index a420cc8..6828490 100644 --- a/matrix_gpt/handle_actions.py +++ b/matrix_gpt/handle_actions.py @@ -50,7 +50,7 @@ async def do_reply_threaded_msg(client_helper: MatrixClientHelper, room: MatrixR await client.room_typing(room.room_id, typing_state=True, timeout=30000) thread_content = await get_thread_content(client, room, requestor_event) - api_client = api_client_helper.get_client(command_info.api_type) + api_client = api_client_helper.get_client(command_info.api_type, client_helper) for event in thread_content: if isinstance(event, MegolmEvent): await client_helper.send_text_to_room( @@ -64,11 +64,15 @@ async def do_reply_threaded_msg(client_helper: MatrixClientHelper, room: MatrixR await client.room_typing(room.room_id, typing_state=False, timeout=1000) return else: - thread_msg = event.body.strip().strip('\n') - api_client.append_msg( - role=api_client.BOT_NAME if event.sender == client.user_id else api_client.HUMAN_NAME, - content=thread_msg if not check_command_prefix(thread_msg)[0] else thread_msg[len(sent_command_prefix):].strip() - ) + role = api_client.BOT_NAME if event.sender == client.user_id else api_client.HUMAN_NAME + if isinstance(event, RoomMessageText): + thread_msg = event.body.strip().strip('\n') + api_client.append_msg( + role=role, + content=thread_msg if not check_command_prefix(thread_msg)[0] else thread_msg[len(sent_command_prefix):].strip(), + ) + elif command_info.vision: + await api_client.append_img(event, role) await generate_ai_response( client_helper=client_helper, diff --git a/matrix_gpt/image.py b/matrix_gpt/image.py new file mode 100644 index 0000000..4796982 --- /dev/null +++ b/matrix_gpt/image.py @@ -0,0 +1,23 @@ +import base64 +import io + +from PIL import Image + + +def process_image(source_bytes: bytes, resize_px: int): + image = Image.open(io.BytesIO(source_bytes)) + width, height = image.size + + if min(width, height) > resize_px: + if width < height: + new_width = resize_px + new_height = int((height / width) * new_width) + else: + new_height = resize_px + new_width = int((width / height) * new_height) + image = image.resize((new_width, new_height)) + + byte_arr = io.BytesIO() + image.save(byte_arr, format='PNG') + image_bytes = byte_arr.getvalue() + return base64.b64encode(image_bytes).decode('utf-8') diff --git a/matrix_gpt/matrix_helper.py b/matrix_gpt/matrix_helper.py index eb46cfc..df0795f 100644 --- a/matrix_gpt/matrix_helper.py +++ b/matrix_gpt/matrix_helper.py @@ -29,7 +29,6 @@ class MatrixClientHelper: self.store_path = Path(store_path).absolute().expanduser().resolve() self.store_path.mkdir(parents=True, exist_ok=True) self.auth_file = self.store_path / (device_id.lower() + '.json') - self.device_name = device_id self.client: AsyncClient = AsyncClient(homeserver=self.homeserver, user=self.user_id, config=self.client_config, device_id=device_id) self.logger = logging.getLogger('MatrixGPT').getChild('MatrixClientHelper') @@ -85,8 +84,8 @@ class MatrixClientHelper: def _read_details_from_disk(self): if not self.auth_file.exists(): - return {} - with open(self.auth_file, "r") as f: + return {'auth': {}, 'extra': {}} + with open(self.auth_file, 'r') as f: return json.load(f) def _write_details_to_disk(self, resp: LoginResponse = None, extra_data: dict = None) -> None: @@ -165,7 +164,6 @@ class MatrixClientHelper: } } - # TODO: don't force this to string. what if we want to send an array? content["m.matrixgpt"] = { "error": str(extra_error), "msg": str(extra_msg), diff --git a/requirements.txt b/requirements.txt index bc50008..4bb22f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ -matrix-nio[e2e]==0.24.0 -pyyaml==6.0.1 +matrix-nio==0.24.0 markdown==3.6 -python-olm==3.2.16 openai==1.16.2 anthropic==0.23.1 +pillow==10.3.0 git+https://git.evulid.cc/cyberes/bison.git