fix undetected-chromedriver issue, bump up read_webpage character limit, adjust tool descriptions

2024-03-07 18:49:41 -07:00 · 2024-03-07 18:49:41 -07:00 · 57d47eb7fe
parent f4848423b8
commit 57d47eb7fe
3 changed files with 8 additions and 6 deletions
--- a/pers/langchain/tools/browser.py
+++ b/pers/langchain/tools/browser.py
@ -15,7 +15,7 @@ def get_chrome_webdriver():
    chrome_options.add_argument("--test-type")
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument('--disable-extensions')
-    chrome_options.add_argument('disable-infobars')
+    chrome_options.add_argument('--disable-infobars')
    chrome_options.add_argument("--incognito")
    driver = undetected_chromedriver.Chrome(headless=True, options=chrome_options)
    return driver
@ -32,7 +32,7 @@ def render_webpage(url: str):

@tool('render_webpage')
 def render_webpage_tool(url: str, reasoning: str):
-    """Fetches the raw HTML of a webpage for use with the `retrieve_from_chroma` tool."""
+    """Fetches the raw HTML of a webpage for use with the `retrieve_from_chroma` tool. Best for when you need to do complicated parsing of a webpage or are dealing with very long pages."""
    if PRINT_USAGE:
        _print_func_call('render_webpage', {'url': url, 'reasoning': reasoning})
    html_source = render_webpage(url)
--- a/pers/langchain/tools/web_reader.py
+++ b/pers/langchain/tools/web_reader.py
@ -30,7 +30,7 @@ PUBLISH DATE: {publish_date}
 TOP_IMAGE_URL: {top_image}
 """

-MAX_RESULT_LENGTH_CHAR = 1000 * 4  # roughly 1,000 tokens
+MAX_RESULT_LENGTH_CHAR = 1000 * 6


 def page_result(text: str, cursor: int, max_length: int) -> str:
@ -121,7 +121,9 @@ class ReaderToolInput(BaseModel):

@tool(args_schema=ReaderToolInput)
 def read_webpage(url: str, reasoning: str, include_body: bool = True, cursor: int = 0):
-    """Fetch a webpage's text content. This function may not correctly parse complicated webpages, so use render_webpage if targeting specific HTML elements or expecting a complicated page."""
+    """Fetch a webpage's text content.
+    This tool truncates the text content if it is longer than the context limit. You will see a line like `PAGE WAS TRUNCATED. TO CONTINUE READING, USE CURSOR=n.` when this happens, where `CURSOR=n` is the starting position for the next page. To continue reading, call this tool again with the `cursor` argument set to the position where you want to begin.
+    This function may not correctly parse complicated webpages, so use render_webpage if targeting specific HTML elements or expecting a complicated page.  Best for when you need to simply read the page."""
    if PRINT_USAGE:
        _print_func_call('read_webpage', {'url': url, 'reasoning': reasoning})

--- a/requirements.txt
+++ b/requirements.txt
@ -8,10 +8,10 @@ newspaper3k
 playwright
 beautifulsoup4==4.12.3
 chromedriver-autoinstaller==0.6.4
-undetected-chromedriver==3.5.4
+undetected-chromedriver==3.5.5
 redis==5.0.1
 async-timeout==4.0.3
 pyyaml==6.0.1
 py-cpuinfo==9.0.0
 psutil==5.9.8
-chromadb==0.4.22
+chromadb==0.4.22