Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/lcas-figshare-processing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,10 @@ jobs:
cd ./output
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ github.event.inputs.use_author_cache }}" = "true" ]; then
echo "Running figshare_fetch.py with --use-author-cache (manually triggered)"
python ../figshare_fetch.py --use-author-cache -f ../lcas-authors.txt --max-retries 30 --rate-limit-delay 0.1
python ../figshare_fetch.py --use-author-cache -c ../lcas-authors.yaml --max-retries 30 --rate-limit-delay 0.1
else
echo "Running figshare_fetch.py without cache (default behavior)"
python ../figshare_fetch.py -f ../lcas-authors.txt --max-retries 30 --rate-limit-delay 0.1
python ../figshare_fetch.py -c ../lcas-authors.yaml --max-retries 30 --rate-limit-delay 0.1
fi

- name: Run figshare bibtex (Step 2 - Generate bibtex from CSV)
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/uoa11-figshare-processing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,10 @@ jobs:
cd ./output
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ github.event.inputs.use_author_cache }}" = "true" ]; then
echo "Running figshare_fetch.py with --use-author-cache (manually triggered)"
python ../figshare_fetch.py -f ../uoa11-authors.txt --use-author-cache --max-retries 30 --rate-limit-delay 0.1 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
python ../figshare_fetch.py -c ../uoa11-authors.yaml --use-author-cache --max-retries 30 --rate-limit-delay 0.1 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
else
echo "Running figshare_fetch.py without cache (default behavior)"
python ../figshare_fetch.py -f ../uoa11-authors.txt --rate-limit-delay 0.1 --max-retries 30 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
python ../figshare_fetch.py -c ../uoa11-authors.yaml --rate-limit-delay 0.1 --max-retries 30 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
fi

- name: Run figshare bibtex (Step 2 - Generate bibtex from CSV)
Expand Down
100 changes: 89 additions & 11 deletions author.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,54 @@


class Author:
def __init__(self, name, debug=False, rate_limit_delay=1.0, max_retries=5):
"""Represents an author and manages their Figshare article collection.

This class handles retrieving, processing, and caching article metadata for
a specific author from the Figshare repository.
"""

def __init__(self, name, user_id=None, institution_id=None, orcid=None, debug=False, rate_limit_delay=1.0, max_retries=5):
    """Initialize an Author instance.

    Args:
        name: Author's full name (required)
        user_id: Figshare user ID (optional, improves search accuracy)
        institution_id: Institution ID for filtering articles (optional, recommended)
        orcid: Author's ORCID identifier (optional, for reference)
        debug: Enable debug logging (default: False)
        rate_limit_delay: Delay in seconds between API requests (default: 1.0)
        max_retries: Maximum retry attempts for failed API calls (default: 5)
    """
    self.logger = getLogger("Author")
    if debug:
        self.logger.setLevel(DEBUG)
    self.name = name
    self.user_id = user_id
    self.institution_id = institution_id
    self.orcid = orcid
    # Figshare API client shared by all retrieval calls for this author.
    self.fs = FigShare(rate_limit_delay=rate_limit_delay, max_retries=max_retries)
    # Article metadata collection; populated by the retrieval methods.
    self.articles = {}
    # NOTE(review): presumably the base URL for building public repository
    # links — confirm against the methods that consume it (not shown here).
    self.public_html_prefix = "https://repository.lincoln.ac.uk"
    # Pandas dataframe of the articles; built lazily (None until created).
    self.df = None
def save(self, filename=None):
"""Save author's articles and dataframe to a persistent cache file.

Args:
filename: Path to cache file (default: '{author_name}.db')
"""
if filename is None:
filename = f"{self.name}.db"
with shelve.open(filename) as db:
db['articles'] = self.articles
db['df'] = self.df

def load(self, filename=None):
"""Load author's articles and dataframe from a persistent cache file.

Args:
filename: Path to cache file (default: '{author_name}.db')
"""
if filename is None:
filename = f"{self.name}.db"
with shelve.open(filename) as db:
Expand All @@ -39,12 +69,47 @@ def load(self, filename=None):


def _retrieve_figshare(self, use_cache=True):
    """Retrieve articles for this author from Figshare.

    Uses the most precise search method available based on the author metadata:
    - If user_id and/or institution_id are available, uses articles_by_author()
      with filtering for more accurate results
    - Otherwise, falls back to simple name-based search

    Args:
        use_cache: Whether to use cached API results (default: True)
    """
    self.logger.info(f"retrieving articles for {self.name}")

    # Fix: removed a leftover unconditional articles_by_user_name() call that
    # preceded this if/else — its result was always overwritten below, so it
    # only cost an extra round of API requests per author.
    if self.user_id or self.institution_id:
        # Enhanced search: user_id is echoed in logs, institution_id filters.
        if self.user_id and self.institution_id:
            self.logger.info(f"Using enhanced search for user_id {self.user_id} with institution_id {self.institution_id}")
        elif self.user_id:
            self.logger.info(f"Using enhanced search for user_id {self.user_id}")
        else:
            self.logger.info(f"Using enhanced search with institution_id {self.institution_id}")

        self.articles = self.fs.articles_by_author(
            self.name,
            user_id=self.user_id,
            institution_id=self.institution_id,
            use_cache=use_cache
        )
    else:
        # No identifying metadata beyond the name: plain name search.
        self.logger.info("Using basic name search (no user_id or institution_id available)")
        self.articles = self.fs.articles_by_user_name(self.name, use_cache=use_cache)

    self.logger.info(f"found {len(self.articles)} articles for {self.name}")

def _retrieve_details(self, use_cache=True):
"""Retrieve detailed metadata for each article.

Fetches full article details including custom fields, tags, categories, etc.

Args:
use_cache: Whether to use cached API results (default: True)
"""
for article in self.articles:
self.logger.info(f"retrieving details for article {article['id']}")
article['details'] = self.fs.get_article(article['id'], use_cache=use_cache)
Expand Down Expand Up @@ -122,9 +187,13 @@ def _retrieve_bibtex_from_dois(self):
self.logger.warning(f"Failed to get bibtex for {doi}: {e}")

def _flatten(self):
    """Flatten each nested article dict into path-keyed top-level entries.

    Replaces ``self.articles`` with a list of flattened dicts (nested keys
    joined by '/'); articles that fail to flatten are logged and dropped.
    """
    self.logger.info(f"flattening article dicts for {self.name}")
    flattened = []
    for article in self.articles:
        # Fix: removed a leftover unconditional append that sat outside the
        # try/except — it duplicated every article and bypassed the guard.
        try:
            flattened.append(flatten(article, reducer='path'))
        except Exception as e:
            # Best-effort: skip a malformed article rather than abort the run.
            self.logger.warning(f"Failed to flatten article {article}: {e}")
    self.articles = flattened

def retrieve(self, use_cache=True):
Expand All @@ -144,18 +213,27 @@ def _create_dataframe(self):
self.df = pd.DataFrame.from_dict(self.articles)
# add column with author name
self.df['author'] = self.name

# add column with online date (as datetime object)
self.df['online_date'] = pd.to_datetime(self.df['timeline/firstOnline'], utc=True)
if 'timeline/firstOnline' in self.df.columns:
self.df['online_date'] = pd.to_datetime(self.df['timeline/firstOnline'], utc=True)
else:
self.logger.warning(f"'timeline/firstOnline' field not found, setting online_date to NaT")
self.df['online_date'] = pd.NaT

# add column with online year
self.df['online_year'] = self.df['online_date'].apply(
lambda x: x.year
lambda x: x.year if pd.notna(x) else None
)

# add column with external DOI, parsed from custom_fields
self.df['External DOI'] = self.df['details/custom_fields/External DOI'].apply(
lambda x: re.sub(r'^(?:https?://doi\.org/|doi:)', '', x[0], flags=re.IGNORECASE).replace('doi:','')
if isinstance(x, list) and len(x) > 0 else None
)


if 'details/custom_fields/External DOI' in self.df.columns:
self.df['External DOI'] = self.df['details/custom_fields/External DOI'].apply(
lambda x: re.sub(r'^(?:https?://doi\.org/|doi:)', '', x[0], flags=re.IGNORECASE).replace('doi:','')
if isinstance(x, list) and len(x) > 0 else None
)
else:
self.logger.warning(f"'details/custom_fields/External DOI' field not found, setting External DOI to None")
self.df['External DOI'] = None

return self.df
86 changes: 86 additions & 0 deletions figshare_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,80 @@ def __post(self, url, params=None, use_cache=True):
self.logger.warning(f"Received empty or invalid JSON response for POST {self.base_url + url} (status: {response.status_code})")
return []

def articles_by_author(self, author_name, user_id=None, institution_id=None, use_cache=True):
    """Collect every article Figshare returns for an author-name search.

    Issues paged POSTs to /articles/search using the ``:author:`` operator.
    Figshare's search API cannot look up by author id directly, so ``user_id``
    is used only to annotate log lines, while ``institution_id`` (when given)
    is passed as a direct ``institution`` parameter to narrow the hits.

    Args:
        author_name: Full author name to search for (required).
        user_id: Figshare user id, echoed in log output for traceability.
        institution_id: Institution id used to filter search results
            (recommended when available).
        use_cache: Whether to reuse cached API responses (default: True).

    Returns:
        A list of article metadata dicts (id, title, authors, DOI, ...).

    Example:
        articles = fs.articles_by_author(
            "Marc Hanheide",
            user_id=17159320,
            institution_id=1068
        )
    """
    params = self.__init_params()

    # ':author:' is the only reliable author-search operator Figshare offers.
    params["search_for"] = f':author: "{author_name}"'

    if institution_id:
        # Significantly narrows results when several authors share a name.
        params["institution"] = institution_id
        self.logger.info(f"Filtering by institution_id: {institution_id}")

    # Suffix appended to log lines whenever a user id is known.
    id_note = f" (user_id: {user_id})" if user_id else ""

    collected = []
    page = 0
    while True:
        page += 1
        params["page"] = page
        self.logger.info(f"retrieving page {page} for {author_name}{id_note}")
        batch = self.__post("/articles/search", params=params, use_cache=use_cache)
        if not batch:
            break
        collected.extend(batch)

    self.logger.info(f"found {len(collected)} articles for {author_name}{id_note}")
    return collected

def articles_by_user_name(self, user_name, use_cache=True):
"""Search for articles by author name without additional filtering.

This is a simpler version of articles_by_author() without institution
filtering or user_id tracking. Use articles_by_author() for more precise
searches when institution_id is available.

Args:
user_name: The author's full name to search for
use_cache: Whether to use cached results (default: True)

Returns:
List of article dictionaries matching the author name
"""
params = self.__init_params()
params["search_for"] = f':author: \"{user_name}\"'
page = 1
Expand All @@ -159,3 +231,17 @@ def articles_by_user_name(self, user_name, use_cache=True):

def get_article(self, article_id, use_cache=True):
    """Fetch the full metadata record for a single Figshare article.

    Args:
        article_id: Figshare article id, interpolated into the endpoint path.
        use_cache: Whether to reuse cached API responses (default: True).

    Returns:
        Whatever the GET /articles/{id} endpoint returns via __get
        (the article's detail record).
    """
    return self.__get(f"/articles/{article_id}", use_cache=use_cache)

def search_authors(self, params, use_cache=True):
    """Look up author accounts via the Figshare account-search endpoint.

    Args:
        params: Dict of search parameters (search, orcid, is_active,
            is_public, group_id, institution_id).
        use_cache: Whether to reuse cached API responses.

    Returns:
        List of author account dicts matching the criteria.
    """
    endpoint = "/account/authors/search"
    self.logger.info(f"Searching for authors with params: {params}")
    return self.__post(endpoint, params=params, use_cache=use_cache)
Loading
Loading