Spaces:
Runtime error
Runtime error
| # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | |
| import re | |
| from typing import Any, Dict, List, Optional | |
| from camel.toolkits import FunctionTool | |
| from camel.toolkits.base import BaseToolkit | |
class GoogleScholarToolkit(BaseToolkit):
    r"""A toolkit for retrieving information about authors and their
    publications from Google Scholar.

    Attributes:
        author_identifier (str): The author's Google Scholar URL or name
            of the author to search for.
        is_author_name (bool): Flag to indicate if the identifier is a name.
            (default: :obj:`False`)
        scholarly (module): The scholarly module for querying Google Scholar.
        author (Dict[str, Any]): Cached author details (property). Fetched
            on first access; may also be assigned manually.
    """

    def __init__(
        self,
        author_identifier: str,
        is_author_name: bool = False,
        use_free_proxies: bool = False,
        proxy_http: Optional[str] = None,
        proxy_https: Optional[str] = None,
    ) -> None:
        r"""Initializes the GoogleScholarToolkit with the author's identifier.

        Args:
            author_identifier (str): The author's Google Scholar URL or name
                of the author to search for.
            is_author_name (bool): Flag to indicate if the identifier is a
                name. (default: :obj:`False`)
            use_free_proxies (bool): Whether to use Free Proxies.
                (default: :obj:`False`)
            proxy_http (Optional[str]): Proxy http address passed to
                `ProxyGenerator.SingleProxy`. (default: :obj:`None`)
            proxy_https (Optional[str]): Proxy https address passed to
                `ProxyGenerator.SingleProxy`. (default: :obj:`None`)
        """
        # Imported lazily so the dependency is only needed when the toolkit
        # is actually instantiated.
        from scholarly import ProxyGenerator, scholarly

        # Use free proxies if requested.
        if use_free_proxies:
            pg = ProxyGenerator()
            pg.FreeProxies()
            scholarly.use_proxy(pg)

        # Use an explicit proxy if an HTTP or HTTPS address is provided.
        # This runs after the free-proxy setup, so when both are given the
        # explicit proxy is the one registered last.
        if proxy_http or proxy_https:
            pg = ProxyGenerator()
            pg.SingleProxy(http=proxy_http, https=proxy_https)
            scholarly.use_proxy(pg)

        self.scholarly = scholarly
        self.author_identifier = author_identifier
        self.is_author_name = is_author_name
        self._author: Optional[Dict[str, Any]] = None

    @property
    def author(self) -> Dict[str, Any]:
        r"""Getter for the author attribute, fetching details if not cached.

        Returns:
            Dict[str, Any]: A dictionary containing author details. If no data
                is available, returns an empty dictionary.
        """
        if self._author is None:
            self.get_author_detailed_info()
        return self._author or {}

    @author.setter
    def author(self, value: Optional[Dict[str, Any]]) -> None:
        r"""Sets or overrides the cached author information.

        Args:
            value (Optional[Dict[str, Any]]): A dictionary containing author
                details to cache or `None` to clear the cached data.

        Raises:
            ValueError: If `value` is not a dictionary or `None`.
        """
        if value is None or isinstance(value, dict):
            self._author = value
        else:
            raise ValueError("Author must be a dictionary or None.")

    def _extract_author_id(self) -> Optional[str]:
        r"""Extracts the author ID from a Google Scholar URL if provided.

        Returns:
            Optional[str]: The extracted author ID, or None if not found.
        """
        # Author IDs may contain letters, digits, '-' and '_'; `\w` covers
        # the underscore that a plain alphanumeric class would cut off.
        match = re.search(r'user=([\w-]+)', self.author_identifier)
        return match.group(1) if match else None

    def get_author_detailed_info(
        self,
    ) -> dict:
        r"""Retrieves detailed information about the author.

        Returns:
            dict: A dictionary containing detailed information about the
                author.

        Raises:
            ValueError: If the name search yields no result, or if no author
                ID can be extracted from the given URL.
        """
        if self.is_author_name:
            search_query = self.scholarly.search_author(self.author_identifier)
            # Take the first result; a default avoids leaking a bare
            # StopIteration when the search returns nothing.
            first_author_result = next(search_query, None)
            if first_author_result is None:
                raise ValueError(
                    f"No Google Scholar author found for name "
                    f"'{self.author_identifier}'."
                )
        else:
            author_id = self._extract_author_id()
            if author_id is None:
                raise ValueError(
                    f"Could not extract an author ID from "
                    f"'{self.author_identifier}'."
                )
            first_author_result = self.scholarly.search_author_id(id=author_id)

        # Populate the remaining author fields and cache the result.
        self._author = self.scholarly.fill(first_author_result)
        return self._author  # type: ignore[return-value]

    def get_author_publications(
        self,
    ) -> List[str]:
        r"""Retrieves the titles of the author's publications.

        Returns:
            List[str]: A list of publication titles authored by the author.
        """
        # `self.author` may be an empty dict, so fall back to no publications.
        publications = self.author.get('publications', [])
        return [pub['bib']['title'] for pub in publications]

    def get_publication_by_title(
        self, publication_title: str
    ) -> Optional[dict]:
        r"""Retrieves detailed information about a specific publication by its
        title. Note that this method cannot retrieve the full content of the
        paper.

        Args:
            publication_title (str): The title of the publication to search
                for.

        Returns:
            Optional[dict]: A dictionary containing detailed information about
                the publication if found; otherwise, `None`.
        """
        # Exact, case-sensitive title match; only the first hit is filled.
        for publication in self.author.get('publications', []):
            if publication['bib']['title'] == publication_title:
                return self.scholarly.fill(publication)
        return None  # Return None if not found

    def get_full_paper_content_by_link(self, pdf_url: str) -> Optional[str]:
        r"""Retrieves the full paper content from a given PDF URL using the
        arxiv2text tool.

        Args:
            pdf_url (str): The URL of the PDF file.

        Returns:
            Optional[str]: The full text extracted from the PDF, or `None` if
                an error occurs.
        """
        from arxiv2text import arxiv_to_text

        # Deliberately best-effort: any extraction failure is reported to
        # the caller as `None` instead of an exception.
        try:
            return arxiv_to_text(pdf_url)
        except Exception:
            return None  # Return None in case of any error

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects
                representing the functions in the toolkit.
        """
        return [
            FunctionTool(self.get_author_detailed_info),
            FunctionTool(self.get_author_publications),
            FunctionTool(self.get_publication_by_title),
            FunctionTool(self.get_full_paper_content_by_link),
        ]