Add tool to only extract parts of a website
Add tool `extract_website_content_parts` that does not return the whole website but only the parts matching a regexp.
This is necessary to avoid exceeding the LLM's context limit. Merely telling the model to produce such extraction code itself
(i.e., what `extract_website_content_parts` implements) does not work: the LLM writes code that always
prints the whole contents first, thereby pushing the whole contents into the LLM context.
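A hypothetical usage sketch of the new tool (the URL and regexp below are illustrative assumptions, not from this commit):

# Hypothetical usage sketch: URL and pattern are made-up examples.
# Instead of feeding the whole page into the agent's context, only the
# fragments matching the regexp are returned.
headings = extract_website_content_parts(
    "https://example.com/blog",   # assumed example URL
    r"<h2[^>]*>(.*?)</h2>",       # capture only the <h2> heading text
)
print(headings)  # e.g. ['Post one', 'Post two'] -- only these reach the LLM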
app.py
CHANGED
@@ -1,6 +1,8 @@
 import datetime
-import requests
 import pytz
+import re
+import requests
+from typing import List
 import yaml
 
 from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel, tool
@@ -24,6 +26,24 @@ def get_website_content(url: str) -> str:
     except requests.RequestException as e:
         return f"Error fetching website content: {str(e)}"
 
+@tool
+def extract_website_content_parts(url: str, extraction_pattern: str) -> List[str]:
+    """
+    This tool extracts content parts matching the regular expression string `extraction_pattern` of a website given its `url`.
+    Args:
+        url: The URL of the website from which content parts should be extracted
+        extraction_pattern: The regular expression string of the content parts to extract from the website
+    Returns:
+        List[str]: The content parts matching extraction_pattern of the website `url`
+    """
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        matches: List[str] = re.findall(extraction_pattern, response.text)
+        return matches
+    except requests.RequestException as e:
+        return [f"Error fetching website content: {str(e)}"]
+
 @tool
 def get_papers_url_for_date(year:int, month:int, day:int)-> str:
     """A tool that constructs a URL where machine learning papers for a specific date (YYYY-MM-DD) are listed.
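Two notes on the change above: the error message is returned wrapped in a single-element list so the return type stays List[str] on both the success and failure paths; and what the tool returns depends on re.findall's capture-group semantics, sketched below (a standalone check, not part of the commit):

import re

html = "<li>paper one</li><li>paper two</li>"  # assumed sample input
# With exactly one capture group, findall yields only the group contents.
print(re.findall(r"<li>(.*?)</li>", html))  # ['paper one', 'paper two']
# With no group, findall yields the full matches.
print(re.findall(r"<li>.*?</li>", html))    # ['<li>paper one</li>', '<li>paper two</li>']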