Spaces:
Sleeping
Sleeping
Deduplicate extracted content parts
Browse files
Deduplicate extracted content parts because every part showed up twice.
Add `extract_website_content_parts` to the `tools` list.
app.py
CHANGED
|
@@ -34,13 +34,13 @@ def extract_website_content_parts(url: str, extraction_pattern: str) -> List[str
|
|
| 34 |
url: The URL of the website from which content parts should be extracted
|
| 35 |
extraction_pattern: The regular expression string of the content parts to extract from the website
|
| 36 |
Returns:
|
| 37 |
-
List[str]: The content parts matching extraction_pattern of the website `url`
|
| 38 |
"""
|
| 39 |
try:
|
| 40 |
response = requests.get(url)
|
| 41 |
response.raise_for_status()
|
| 42 |
matches: List[str] = re.findall(extraction_pattern, response.text)
|
| 43 |
-
return matches
|
| 44 |
except requests.RequestException as e:
|
| 45 |
return [f"Error fetching website content: {str(e)}"]
|
| 46 |
|
|
@@ -92,7 +92,7 @@ with open("prompts.yaml", 'r') as stream:
|
|
| 92 |
|
| 93 |
agent = CodeAgent(
|
| 94 |
model=model,
|
| 95 |
-
tools=[final_answer, search_tool, get_website_content, get_papers_url_for_date, get_current_time_in_timezone],
|
| 96 |
max_steps=30,
|
| 97 |
verbosity_level=1,
|
| 98 |
grammar=None,
|
|
|
|
| 34 |
url: The URL of the website from which content parts should be extracted
|
| 35 |
extraction_pattern: The regular expression string of the content parts to extract from the website
|
| 36 |
Returns:
|
| 37 |
+
List[str]: The deduplicated content parts matching extraction_pattern of the website `url`
|
| 38 |
"""
|
| 39 |
try:
|
| 40 |
response = requests.get(url)
|
| 41 |
response.raise_for_status()
|
| 42 |
matches: List[str] = re.findall(extraction_pattern, response.text)
|
| 43 |
+
return list(set(matches))
|
| 44 |
except requests.RequestException as e:
|
| 45 |
return [f"Error fetching website content: {str(e)}"]
|
| 46 |
|
|
|
|
| 92 |
|
| 93 |
agent = CodeAgent(
|
| 94 |
model=model,
|
| 95 |
+
tools=[final_answer, search_tool, extract_website_content_parts, get_website_content, get_papers_url_for_date, get_current_time_in_timezone],
|
| 96 |
max_steps=30,
|
| 97 |
verbosity_level=1,
|
| 98 |
grammar=None,
|