AutoPage

Runtime error

App Files Files Community

AutoPage / camel /toolkits /google_scholar_toolkit.py

Mqleet

upd code

fcaa164 about 2 months ago

raw

history blame contribute delete

7.43 kB

	# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
	import re
	from typing import Any, Dict, List, Optional

	from camel.toolkits import FunctionTool
	from camel.toolkits.base import BaseToolkit


	class GoogleScholarToolkit(BaseToolkit):
	r"""A toolkit for retrieving information about authors and their
	publications from Google Scholar.

	Attributes:
	author_identifier (Union[str, None]): The author's Google Scholar URL
	or name of the author to search for.
	is_author_name (bool): Flag to indicate if the identifier is a name.
	(default: :obj:`False`)
	scholarly (module): The scholarly module for querying Google Scholar.
	author (Optional[Dict[str, Any]]): Cached author details, allowing
	manual assignment if desired.
	"""

	def __init__(
	self,
	author_identifier: str,
	is_author_name: bool = False,
	use_free_proxies: bool = False,
	proxy_http: Optional[str] = None,
	proxy_https: Optional[str] = None,
	) -> None:
	r"""Initializes the GoogleScholarToolkit with the author's identifier.

	Args:
	author_identifier (str): The author's Google Scholar URL or name
	of the author to search for.
	is_author_name (bool): Flag to indicate if the identifier is a
	name. (default: :obj:`False`)
	use_free_proxies (bool): Whether to use Free Proxies.
	(default: :obj:`False`)
	proxy_http ( Optional[str]): Proxy http address pass to pg.
	SingleProxy. (default: :obj:`None`)
	proxy_https ( Optional[str]): Proxy https address pass to pg.
	SingleProxy. (default: :obj:`None`)
	"""
	from scholarly import ProxyGenerator, scholarly

	# Set Free Proxies is needed
	if use_free_proxies:
	pg = ProxyGenerator()
	pg.FreeProxies()
	scholarly.use_proxy(pg)

	# Set Proxy is HTTP or HTTPS provided
	if proxy_http or proxy_https:
	pg = ProxyGenerator()
	pg.SingleProxy(http=proxy_http, https=proxy_https)
	scholarly.use_proxy(pg)

	self.scholarly = scholarly
	self.author_identifier = author_identifier
	self.is_author_name = is_author_name
	self._author: Optional[Dict[str, Any]] = None

	@property
	def author(self) -> Dict[str, Any]:
	r"""Getter for the author attribute, fetching details if not cached.

	Returns:
	Dict[str, Any]: A dictionary containing author details. If no data
	is available, returns an empty dictionary.
	"""
	if self._author is None:
	self.get_author_detailed_info()
	return self._author or {}

	@author.setter
	def author(self, value: Optional[Dict[str, Any]]) -> None:
	r"""Sets or overrides the cached author information.

	Args:
	value (Optional[Dict[str, Any]]): A dictionary containing author
	details to cache or `None` to clear the cached data.

	Raises:
	ValueError: If `value` is not a dictionary or `None`.
	"""
	if value is None or isinstance(value, dict):
	self._author = value
	else:
	raise ValueError("Author must be a dictionary or None.")

	def _extract_author_id(self) -> Optional[str]:
	r"""Extracts the author ID from a Google Scholar URL if provided.

	Returns:
	Optional[str]: The extracted author ID, or None if not found.
	"""
	match = re.search(r'user=([A-Za-z0-9-]+)', self.author_identifier)
	return match.group(1) if match else None

	def get_author_detailed_info(
	self,
	) -> dict:
	r"""Retrieves detailed information about the author.

	Returns:
	dict: A dictionary containing detailed information about the
	author.
	"""
	if self.is_author_name:
	search_query = self.scholarly.search_author(self.author_identifier)
	# Retrieve the first result from the iterator
	first_author_result = next(search_query)
	else:
	author_id = self._extract_author_id()
	first_author_result = self.scholarly.search_author_id(id=author_id)

	self._author = self.scholarly.fill(first_author_result)
	return self._author # type: ignore[return-value]

	def get_author_publications(
	self,
	) -> List[str]:
	r"""Retrieves the titles of the author's publications.

	Returns:
	List[str]: A list of publication titles authored by the author.
	"""
	publication_titles = [
	pub['bib']['title'] for pub in self.author['publications']
	]
	return publication_titles

	def get_publication_by_title(
	self, publication_title: str
	) -> Optional[dict]:
	r"""Retrieves detailed information about a specific publication by its
	title. Note that this method cannot retrieve the full content of the
	paper.

	Args:
	publication_title (str): The title of the publication to search
	for.

	Returns:
	Optional[dict]: A dictionary containing detailed information about
	the publication if found; otherwise, `None`.
	"""
	publications = self.author['publications']
	for publication in publications:
	if publication['bib']['title'] == publication_title:
	return self.scholarly.fill(publication)
	return None # Return None if not found

	def get_full_paper_content_by_link(self, pdf_url: str) -> Optional[str]:
	r"""Retrieves the full paper content from a given PDF URL using the
	arxiv2text tool.

	Args:
	pdf_url (str): The URL of the PDF file.

	Returns:
	Optional[str]: The full text extracted from the PDF, or `None` if
	an error occurs.
	"""
	from arxiv2text import arxiv_to_text

	try:
	return arxiv_to_text(pdf_url)
	except Exception:
	return None # Return None in case of any error

	def get_tools(self) -> List[FunctionTool]:
	r"""Returns a list of FunctionTool objects representing the
	functions in the toolkit.

	Returns:
	List[FunctionTool]: A list of FunctionTool objects
	representing the functions in the toolkit.
	"""
	return [
	FunctionTool(self.get_author_detailed_info),
	FunctionTool(self.get_author_publications),
	FunctionTool(self.get_publication_by_title),
	FunctionTool(self.get_full_paper_content_by_link),
	]