Add tool to only extract parts of a website
Add tool `extract_website_content_parts` that does not return the whole website but only the parts matching a regexp.
This is necessary to avoid exceeding the LLM's context limit. Merely telling the model to produce such extraction code itself
(i.e., what `extract_website_content_parts` implements) does not work: the LLM writes code that always
prints the whole contents first, thereby pushing the whole contents into the LLM context.
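A hypothetical usage sketch of the new tool (the URL and regexp below are illustrative assumptions, not from this commit):

# Hypothetical usage sketch: URL and pattern are made-up examples.
# Instead of feeding the whole page into the agent's context, only the
# fragments matching the regexp are returned.
headings = extract_website_content_parts(
    "https://example.com/blog",   # assumed example URL
    r"<h2[^>]*>(.*?)</h2>",       # capture only the <h2> heading text
)
print(headings)  # e.g. ['Post one', 'Post two'] -- only these reach the LLM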
app.py
CHANGED
@@ -1,6 +1,8 @@
 import datetime
-import requests
 import pytz
+import re
+import requests
+from typing import List
 import yaml
 
 from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel, tool
@@ -24,6 +26,24 @@ def get_website_content(url: str) -> str:
     except requests.RequestException as e:
         return f"Error fetching website content: {str(e)}"
 
+@tool
+def extract_website_content_parts(url: str, extraction_pattern: str) -> List[str]:
+    """
+    This tool extracts content parts matching the regular expression string `extraction_pattern` of a website given its `url`.
+    Args:
+        url: The URL of the website from which content parts should be extracted
+        extraction_pattern: The regular expression string of the content parts to extract from the website
+    Returns:
+        List[str]: The content parts matching extraction_pattern of the website `url`
+    """
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        matches: List[str] = re.findall(extraction_pattern, response.text)
+        return matches
+    except requests.RequestException as e:
+        return [f"Error fetching website content: {str(e)}"]
+
 @tool
 def get_papers_url_for_date(year:int, month:int, day:int)-> str:
     """A tool that constructs a URL where machine learning papers for a specific date (YYYY-MM-DD) are listed.
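Two notes on the change above: the error message is returned wrapped in a single-element list so the return type stays List[str] on both the success and failure paths; and what the tool returns depends on re.findall's capture-group semantics, sketched below (a standalone check, not part of the commit):

import re

html = "<li>paper one</li><li>paper two</li>"  # assumed sample input
# With exactly one capture group, findall yields only the group contents.
print(re.findall(r"<li>(.*?)</li>", html))  # ['paper one', 'paper two']
# With no group, findall yields the full matches.
print(re.findall(r"<li>.*?</li>", html))    # ['<li>paper one</li>', '<li>paper two</li>']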