Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/lcas-figshare-processing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,10 @@ jobs:
cd ./output
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ github.event.inputs.use_author_cache }}" = "true" ]; then
echo "Running figshare_fetch.py with --use-author-cache (manually triggered)"
python ../figshare_fetch.py --use-author-cache -f ../lcas-authors.txt --max-retries 30 --rate-limit-delay 0.1
python ../figshare_fetch.py --use-author-cache -c ../lcas-authors.yaml --max-retries 30 --rate-limit-delay 0.1
else
echo "Running figshare_fetch.py without cache (default behavior)"
python ../figshare_fetch.py -f ../lcas-authors.txt --max-retries 30 --rate-limit-delay 0.1
python ../figshare_fetch.py -c ../lcas-authors.yaml --max-retries 30 --rate-limit-delay 0.1
fi

- name: Run figshare bibtex (Step 2 - Generate bibtex from CSV)
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/uoa11-figshare-processing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,10 @@ jobs:
cd ./output
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ github.event.inputs.use_author_cache }}" = "true" ]; then
echo "Running figshare_fetch.py with --use-author-cache (manually triggered)"
python ../figshare_fetch.py -f ../uoa11-authors.txt --use-author-cache --max-retries 30 --rate-limit-delay 0.1 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
python ../figshare_fetch.py -c ../uoa11-authors.yaml --use-author-cache --max-retries 30 --rate-limit-delay 0.1 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
else
echo "Running figshare_fetch.py without cache (default behavior)"
python ../figshare_fetch.py -f ../uoa11-authors.txt --rate-limit-delay 0.1 --max-retries 30 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
python ../figshare_fetch.py -c ../uoa11-authors.yaml --rate-limit-delay 0.1 --max-retries 30 -o uoa11-figshare_articles.csv -O uoa11-figshare_articles_all.csv
fi

- name: Run figshare bibtex (Step 2 - Generate bibtex from CSV)
Expand Down
100 changes: 89 additions & 11 deletions author.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,54 @@


class Author:
def __init__(self, name, debug=False, rate_limit_delay=1.0, max_retries=5):
"""Represents an author and manages their Figshare article collection.

This class handles retrieving, processing, and caching article metadata for
a specific author from the Figshare repository.
"""

def __init__(self, name, user_id=None, institution_id=None, orcid=None, debug=False, rate_limit_delay=1.0, max_retries=5):
    """Initialize an Author instance.

    Args:
        name: Author's full name (required)
        user_id: Figshare user ID (optional, improves search accuracy)
        institution_id: Institution ID for filtering articles (optional, recommended)
        orcid: Author's ORCID identifier (optional, for reference)
        debug: Enable debug logging (default: False)
        rate_limit_delay: Delay in seconds between API requests (default: 1.0)
        max_retries: Maximum retry attempts for failed API calls (default: 5)
    """
    self.logger = getLogger("Author")
    if debug:
        self.logger.setLevel(DEBUG)
    self.name = name
    self.user_id = user_id
    self.institution_id = institution_id
    self.orcid = orcid
    # Figshare API client shared by all retrieval calls for this author.
    self.fs = FigShare(rate_limit_delay=rate_limit_delay, max_retries=max_retries)
    # Article metadata collection; populated by the retrieval methods.
    self.articles = {}
    # NOTE(review): presumably the base URL for building public repository
    # links — confirm against the methods that consume it (not shown here).
    self.public_html_prefix = "https://repository.lincoln.ac.uk"
    # Pandas dataframe of the articles; built lazily (None until created).
    self.df = None
def save(self, filename=None):
"""Save author's articles and dataframe to a persistent cache file.

Args:
filename: Path to cache file (default: '{author_name}.db')
"""
if filename is None:
filename = f"{self.name}.db"
with shelve.open(filename) as db:
db['articles'] = self.articles
db['df'] = self.df

def load(self, filename=None):
"""Load author's articles and dataframe from a persistent cache file.

Args:
filename: Path to cache file (default: '{author_name}.db')
"""
if filename is None:
filename = f"{self.name}.db"
with shelve.open(filename) as db:
Expand All @@ -39,12 +69,47 @@ def load(self, filename=None):


def _retrieve_figshare(self, use_cache=True):
    """Retrieve articles for this author from Figshare.

    Uses the most precise search method available based on the author metadata:
    - If user_id and/or institution_id are available, uses articles_by_author()
      with filtering for more accurate results
    - Otherwise, falls back to simple name-based search

    Args:
        use_cache: Whether to use cached API results (default: True)
    """
    self.logger.info(f"retrieving articles for {self.name}")

    # Fix: removed a leftover unconditional articles_by_user_name() call that
    # preceded this if/else — its result was always overwritten below, so it
    # only cost an extra round of API requests per author.
    if self.user_id or self.institution_id:
        # Enhanced search: user_id is echoed in logs, institution_id filters.
        if self.user_id and self.institution_id:
            self.logger.info(f"Using enhanced search for user_id {self.user_id} with institution_id {self.institution_id}")
        elif self.user_id:
            self.logger.info(f"Using enhanced search for user_id {self.user_id}")
        else:
            self.logger.info(f"Using enhanced search with institution_id {self.institution_id}")

        self.articles = self.fs.articles_by_author(
            self.name,
            user_id=self.user_id,
            institution_id=self.institution_id,
            use_cache=use_cache
        )
    else:
        # No identifying metadata beyond the name: plain name search.
        self.logger.info("Using basic name search (no user_id or institution_id available)")
        self.articles = self.fs.articles_by_user_name(self.name, use_cache=use_cache)

    self.logger.info(f"found {len(self.articles)} articles for {self.name}")

def _retrieve_details(self, use_cache=True):
"""Retrieve detailed metadata for each article.

Fetches full article details including custom fields, tags, categories, etc.

Args:
use_cache: Whether to use cached API results (default: True)
"""
for article in self.articles:
self.logger.info(f"retrieving details for article {article['id']}")
article['details'] = self.fs.get_article(article['id'], use_cache=use_cache)
Expand Down Expand Up @@ -122,9 +187,13 @@ def _retrieve_bibtex_from_dois(self):
self.logger.warning(f"Failed to get bibtex for {doi}: {e}")

def _flatten(self):
    """Flatten each nested article dict into path-keyed top-level entries.

    Replaces ``self.articles`` with a list of flattened dicts (nested keys
    joined by '/'); articles that fail to flatten are logged and dropped.
    """
    self.logger.info(f"flattening article dicts for {self.name}")
    flattened = []
    for article in self.articles:
        # Fix: removed a leftover unconditional append that sat outside the
        # try/except — it duplicated every article and bypassed the guard.
        try:
            flattened.append(flatten(article, reducer='path'))
        except Exception as e:
            # Best-effort: skip a malformed article rather than abort the run.
            self.logger.warning(f"Failed to flatten article {article}: {e}")
    self.articles = flattened

def retrieve(self, use_cache=True):
Expand All @@ -144,18 +213,27 @@ def _create_dataframe(self):
self.df = pd.DataFrame.from_dict(self.articles)
# add column with author name
self.df['author'] = self.name

# add column with online date (as datetime object)
self.df['online_date'] = pd.to_datetime(self.df['timeline/firstOnline'], utc=True)
if 'timeline/firstOnline' in self.df.columns:
self.df['online_date'] = pd.to_datetime(self.df['timeline/firstOnline'], utc=True)
else:
self.logger.warning(f"'timeline/firstOnline' field not found, setting online_date to NaT")
self.df['online_date'] = pd.NaT

# add column with online year
self.df['online_year'] = self.df['online_date'].apply(
lambda x: x.year
lambda x: x.year if pd.notna(x) else None
)

# add column with external DOI, parsed from custom_fields
self.df['External DOI'] = self.df['details/custom_fields/External DOI'].apply(
lambda x: re.sub(r'^(?:https?://doi\.org/|doi:)', '', x[0], flags=re.IGNORECASE).replace('doi:','')
if isinstance(x, list) and len(x) > 0 else None
)


if 'details/custom_fields/External DOI' in self.df.columns:
self.df['External DOI'] = self.df['details/custom_fields/External DOI'].apply(
lambda x: re.sub(r'^(?:https?://doi\.org/|doi:)', '', x[0], flags=re.IGNORECASE).replace('doi:','')
if isinstance(x, list) and len(x) > 0 else None
)
else:
self.logger.warning(f"'details/custom_fields/External DOI' field not found, setting External DOI to None")
self.df['External DOI'] = None

return self.df
86 changes: 86 additions & 0 deletions figshare_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,80 @@ def __post(self, url, params=None, use_cache=True):
self.logger.warning(f"Received empty or invalid JSON response for POST {self.base_url + url} (status: {response.status_code})")
return []

def articles_by_author(self, author_name, user_id=None, institution_id=None, use_cache=True):
    """Collect every article Figshare returns for an author-name search.

    Issues paged POSTs to /articles/search using the ``:author:`` operator.
    Figshare's search API cannot look up by author id directly, so ``user_id``
    is used only to annotate log lines, while ``institution_id`` (when given)
    is passed as a direct ``institution`` parameter to narrow the hits.

    Args:
        author_name: Full author name to search for (required).
        user_id: Figshare user id, echoed in log output for traceability.
        institution_id: Institution id used to filter search results
            (recommended when available).
        use_cache: Whether to reuse cached API responses (default: True).

    Returns:
        A list of article metadata dicts (id, title, authors, DOI, ...).

    Example:
        articles = fs.articles_by_author(
            "Marc Hanheide",
            user_id=17159320,
            institution_id=1068
        )
    """
    params = self.__init_params()

    # ':author:' is the only reliable author-search operator Figshare offers.
    params["search_for"] = f':author: "{author_name}"'

    if institution_id:
        # Significantly narrows results when several authors share a name.
        params["institution"] = institution_id
        self.logger.info(f"Filtering by institution_id: {institution_id}")

    # Suffix appended to log lines whenever a user id is known.
    id_note = f" (user_id: {user_id})" if user_id else ""

    collected = []
    page = 0
    while True:
        page += 1
        params["page"] = page
        self.logger.info(f"retrieving page {page} for {author_name}{id_note}")
        batch = self.__post("/articles/search", params=params, use_cache=use_cache)
        if not batch:
            break
        collected.extend(batch)

    self.logger.info(f"found {len(collected)} articles for {author_name}{id_note}")
    return collected

def articles_by_user_name(self, user_name, use_cache=True):
"""Search for articles by author name without additional filtering.

This is a simpler version of articles_by_author() without institution
filtering or user_id tracking. Use articles_by_author() for more precise
searches when institution_id is available.

Args:
user_name: The author's full name to search for
use_cache: Whether to use cached results (default: True)

Returns:
List of article dictionaries matching the author name
"""
params = self.__init_params()
params["search_for"] = f':author: \"{user_name}\"'
page = 1
Expand All @@ -159,3 +231,17 @@ def articles_by_user_name(self, user_name, use_cache=True):

def get_article(self, article_id, use_cache=True):
    """Fetch the full metadata record for a single Figshare article.

    Args:
        article_id: Figshare article id, interpolated into the endpoint path.
        use_cache: Whether to reuse cached API responses (default: True).

    Returns:
        Whatever the GET /articles/{id} endpoint returns via __get
        (the article's detail record).
    """
    return self.__get(f"/articles/{article_id}", use_cache=use_cache)

def search_authors(self, params, use_cache=True):
    """Look up author accounts via the Figshare account-search endpoint.

    Args:
        params: Dict of search parameters (search, orcid, is_active,
            is_public, group_id, institution_id).
        use_cache: Whether to reuse cached API responses.

    Returns:
        List of author account dicts matching the criteria.
    """
    endpoint = "/account/authors/search"
    self.logger.info(f"Searching for authors with params: {params}")
    return self.__post(endpoint, params=params, use_cache=use_cache)
Loading
Loading