From 4224fef9ddcba7cd5dc7fbd29fde907a23df15f5 Mon Sep 17 00:00:00 2001
From: Jeremy Stein <j.stein@ucl.ac.uk>
Date: Fri, 23 Jan 2026 15:24:56 +0000
Subject: [PATCH 1/6] Switch to real hashing

---
 .gitignore                          |  3 +++
 .pre-commit-config.yaml             |  3 ++-
 README.md                           | 22 +++++++++++++++++
 config.EXAMPLE/.env.EXAMPLE         |  5 ++++
 config.EXAMPLE/exporter.env.EXAMPLE |  6 +++++
 config.EXAMPLE/hasher.env.EXAMPLE   | 12 ++++++----
 docker-compose.yml                  |  2 +-
 pyproject.toml                      |  2 +-
 src/pseudon/hashing.py              | 37 ++++++++++++++++++++++-------
 src/settings.py                     |  3 +++
 10 files changed, 79 insertions(+), 16 deletions(-)
 create mode 100644 config.EXAMPLE/.env.EXAMPLE

diff --git a/.gitignore b/.gitignore
index be3f5ce..a44b7dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,6 @@ wheels/
 
 # settings files (should not be in the source tree anyway, but just in case)
 *.env
+
+# snakemake tracking files
+.snakemake
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d4b0fd9..a217d81 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,7 +33,8 @@ repos:
           [
             "pandas-stubs",
             "types-psycopg2",
-            "types-pika"
+            "types-pika",
+            "types-requests",
           ]
         files: src/
   # a collection of sanity checks: check for merge conflicts, check the end of
diff --git a/README.md b/README.md
index 4743482..77dc783 100644
--- a/README.md
+++ b/README.md
@@ -48,18 +48,38 @@ separate to the Emap project root.
 
 ### Instructions for achieving this structure
 
+
+#### Clone repos
 Clone this repo (`waveform-controller`) and [PIXL](https://github.com/SAFEHR-data/PIXL),
 both inside your root directory.
 
+#### make config files
 Set up the config files as follows:
 ```
 mkdir config
 cp waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env
 cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/settings.env
 cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env
+cp waveform-controller/config.EXAMPLE/.env.EXAMPLE waveform-controller/.env
 ```
 From the new config files, remove the comments telling you not to put secrets in it, as instructed.
 
+#### fill in config files
+Fill out the config, as appropriate.
+
+Tip: HASHER_API_LISTEN_PORT should match HASHER_API_PORT if you are running your own instance of the
+PIXL hasher (as things stand, we are doing so).
+
+When updating to a new version of this code, you should diff the .EXAMPLE file against its live version,
+eg. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`.
+
+This checks if any config options have been added/removed from the .EXAMPLE, and thus should be
+added/removed from the live file.
+
+> [!CAUTION]
+> Be careful not to copy sensitive data from the live config file to the .EXAMPLE file!
+
+#### make necessary directories
 If it doesn't already exist you should create a directory named
 `waveform-export` in the parent directory to store the saved waveform
 messages.
@@ -68,6 +88,8 @@ messages.
 mkdir waveform-export
 ```
 
+#### run it!
+
 Build and start the controller and exporter with docker
 ```
 cd waveform-controller
diff --git a/config.EXAMPLE/.env.EXAMPLE b/config.EXAMPLE/.env.EXAMPLE
new file mode 100644
index 0000000..bd473cc
--- /dev/null
+++ b/config.EXAMPLE/.env.EXAMPLE
@@ -0,0 +1,5 @@
+# This is an EXAMPLE file, do not put real secrets in here.
+# Copy it to .env and then DELETE THIS COMMENT.
+# (This is a bit different from the others, which live in config.
+# This one is just for the variables in the docker compose file)
+HASHER_API_LISTEN_PORT=
diff --git a/config.EXAMPLE/exporter.env.EXAMPLE b/config.EXAMPLE/exporter.env.EXAMPLE
index 7be714c..14a601d 100644
--- a/config.EXAMPLE/exporter.env.EXAMPLE
+++ b/config.EXAMPLE/exporter.env.EXAMPLE
@@ -2,7 +2,13 @@
 # Copy it to ../config/exporter.env and then DELETE THIS COMMENT.
 # When does the exporter run
 EXPORTER_CRON_SCHEDULE="14 5 * * *"
+# Where to upload via FTPS
 FTPS_HOST=myftps.example.com
 FTPS_PORT=990
 FTPS_USERNAME=
 FTPS_PASSWORD=
+# only run workflow up to and including the specified rule
+SNAKEMAKE_RULE_UNTIL=
+# point to the hasher we wish to use
+HASHER_API_HOSTNAME=waveform-hasher
+HASHER_API_PORT=
diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE
index 1cd6ad0..ad79b7a 100644
--- a/config.EXAMPLE/hasher.env.EXAMPLE
+++ b/config.EXAMPLE/hasher.env.EXAMPLE
@@ -1,6 +1,10 @@
 # This is an EXAMPLE file, do not put real secrets in here.
 # Copy it to ../config/hasher.env and then DELETE THIS COMMENT.
-HASHER_API_AZ_CLIENT_ID=
-HASHER_API_AZ_CLIENT_PASSWORD=
-HASHER_API_AZ_TENANT_ID=
-HASHER_API_AZ_KEY_VAULT_NAME=
+AZURE_CLIENT_ID=
+AZURE_CLIENT_SECRET=
+AZURE_TENANT_ID=
+AZURE_KEY_VAULT_NAME=
+# This is the "variable name" of the actual secret,
+# and can be fixed and is not secret itself
+AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key"
+
diff --git a/docker-compose.yml b/docker-compose.yml
index d20153a..56ecbb8 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -47,7 +47,7 @@ services:
         HTTPS_PROXY: ${HTTPS_PROXY}
         https_proxy: ${https_proxy}
     ports:
-      - "127.0.0.1:${HASHER_API_PORT}:8000"
+      - "127.0.0.1:${HASHER_API_LISTEN_PORT}:8000"
     env_file:
       - ../config/hasher.env
     restart: unless-stopped
diff --git a/pyproject.toml b/pyproject.toml
index 7ee53db..bb0d238 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ dependencies = [
     "snakemake==9.14.5",
     # need to be compatible with PIXL, which currently pins 2.9.10 (arguably it shouldn't)
     "psycopg2-binary>=2.9.10",
-    "stablehash==0.3.0",
+    "requests==2.32.3",
     # trick for making a "relative" path, works inside or outside container image
     "core @ file:///${PROJECT_ROOT}/../PIXL/pixl_core",
 ]
diff --git a/src/pseudon/hashing.py b/src/pseudon/hashing.py
index d7ac7b9..35bde98 100644
--- a/src/pseudon/hashing.py
+++ b/src/pseudon/hashing.py
@@ -1,17 +1,36 @@
+import logging
 from functools import lru_cache
-from stablehash import stablehash
 
+import requests
 
-@lru_cache
+import settings
+
+logger = logging.getLogger(__name__)
+
+
+@lru_cache(maxsize=1000)
 def do_hash(type_prefix: str, value: str):
-    """Stub implementation of deidentification function for testing purposes.
+    """Pass data to the hasher API for de-identification purposes.
 
     Not that I think this will happen in practice, but we'd want the CSN "1234" to hash
     to a different value than the MRN "1234", so prefix each value with its type.
     """
-    # Full implementation of issue #6 must remove this code and call the real hasher!!
-    SALT = "waveform-exporter"
-    full_value_to_hash = f"{SALT}:{type_prefix}:{value}"
-    full_hash = stablehash(full_value_to_hash).hexdigest()
-    tiny_hash = full_hash[:8]
-    return tiny_hash
+
+    project_slug = "waveform-exporter"
+    full_value_to_hash = f"{type_prefix}:{value}"
+
+    hasher_hostname = settings.HASHER_API_HOSTNAME
+    hasher_port = settings.HASHER_API_PORT
+    hasher_req_url = f"http://{hasher_hostname}:{hasher_port}/hash"
+    request_params: dict[str, str | int] = {
+        "project_slug": project_slug,
+        "message": full_value_to_hash,
+    }
+    # do we need to specify a particular hash length?
+    # request_params["length"] = hash_len
+
+    response = requests.get(hasher_req_url, params=request_params, timeout=30)
+    logger.debug("RESPONSE = %s", response.text)
+    response.raise_for_status()
+    real_hash = response.text
+    return real_hash
diff --git a/src/settings.py b/src/settings.py
index dfbebc1..8aaed85 100644
--- a/src/settings.py
+++ b/src/settings.py
@@ -29,3 +29,6 @@ def get_from_env(env_var, *, default_value=None, setting_name=None):
 get_from_env("FTPS_PORT", default_value=990)
 get_from_env("FTPS_USERNAME")
 get_from_env("FTPS_PASSWORD")
+
+get_from_env("HASHER_API_HOSTNAME")
+get_from_env("HASHER_API_PORT")

From a1e63ab37256369da1708ac53860c7e34af46282 Mon Sep 17 00:00:00 2001
From: Jeremy Stein <j.stein@ucl.ac.uk>
Date: Fri, 23 Jan 2026 16:56:07 +0000
Subject: [PATCH 2/6] We don't actually need to listen on a specified host port
 because we access directly by service name on the docker network. Just let it
 pick an ephemeral one.

---
 README.md                   | 4 ----
 config.EXAMPLE/.env.EXAMPLE | 5 -----
 docker-compose.yml          | 3 ++-
 3 files changed, 2 insertions(+), 10 deletions(-)
 delete mode 100644 config.EXAMPLE/.env.EXAMPLE

diff --git a/README.md b/README.md
index 77dc783..97d29d7 100644
--- a/README.md
+++ b/README.md
@@ -60,16 +60,12 @@ mkdir config
 cp waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env
 cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/settings.env
 cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env
-cp waveform-controller/config.EXAMPLE/.env.EXAMPLE waveform-controller/.env
 ```
 From the new config files, remove the comments telling you not to put secrets in it, as instructed.
 
 #### fill in config files
 Fill out the config, as appropriate.
 
-Tip: HASHER_API_LISTEN_PORT should match HASHER_API_PORT if you are running your own instance of the
-PIXL hasher (as things stand, we are doing so).
-
 When updating to a new version of this code, you should diff the .EXAMPLE file against its live version,
 eg. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`.
 
diff --git a/config.EXAMPLE/.env.EXAMPLE b/config.EXAMPLE/.env.EXAMPLE
deleted file mode 100644
index bd473cc..0000000
--- a/config.EXAMPLE/.env.EXAMPLE
+++ /dev/null
@@ -1,5 +0,0 @@
-# This is an EXAMPLE file, do not put real secrets in here.
-# Copy it to .env and then DELETE THIS COMMENT.
-# (This is a bit different from the others, which live in config.
-# This one is just for the variables in the docker compose file)
-HASHER_API_LISTEN_PORT=
diff --git a/docker-compose.yml b/docker-compose.yml
index 56ecbb8..6c30ca7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -47,7 +47,8 @@ services:
         HTTPS_PROXY: ${HTTPS_PROXY}
         https_proxy: ${https_proxy}
     ports:
-      - "127.0.0.1:${HASHER_API_LISTEN_PORT}:8000"
+      # this is only here as a convenience for testing, we don't actually use it from the exporter
+      - "127.0.0.1::8000"
     env_file:
       - ../config/hasher.env
     restart: unless-stopped

From 7da0c5f208952667f5fc9f64599794ce2a2510a0 Mon Sep 17 00:00:00 2001
From: Jeremy Stein <j.stein@ucl.ac.uk>
Date: Fri, 23 Jan 2026 17:00:18 +0000
Subject: [PATCH 3/6] Specify hasher port as default

---
 config.EXAMPLE/exporter.env.EXAMPLE | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.EXAMPLE/exporter.env.EXAMPLE b/config.EXAMPLE/exporter.env.EXAMPLE
index 14a601d..134ac38 100644
--- a/config.EXAMPLE/exporter.env.EXAMPLE
+++ b/config.EXAMPLE/exporter.env.EXAMPLE
@@ -11,4 +11,4 @@ FTPS_PASSWORD=
 SNAKEMAKE_RULE_UNTIL=
 # point to the hasher we wish to use
 HASHER_API_HOSTNAME=waveform-hasher
-HASHER_API_PORT=
+HASHER_API_PORT=8000

From e6aaf337e9ed4a7121e8e7da6cef2fc1b3dd669e Mon Sep 17 00:00:00 2001
From: Jeremy Stein <j.stein@ucl.ac.uk>
Date: Fri, 23 Jan 2026 17:20:03 +0000
Subject: [PATCH 4/6] Fix linting

---
 config.EXAMPLE/hasher.env.EXAMPLE | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE
index ad79b7a..0631345 100644
--- a/config.EXAMPLE/hasher.env.EXAMPLE
+++ b/config.EXAMPLE/hasher.env.EXAMPLE
@@ -6,5 +6,4 @@ AZURE_TENANT_ID=
 AZURE_KEY_VAULT_NAME=
 # This is the "variable name" of the actual secret,
 # and can be fixed and is not secret itself
-AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key"
-
+AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" # pragma: allowlist secret

From 8f3c455abdb9322ece197e597ab6cb1b97b665bc Mon Sep 17 00:00:00 2001
From: Jeremy Stein <j.stein@ucl.ac.uk>
Date: Mon, 26 Jan 2026 16:24:39 +0000
Subject: [PATCH 5/6] Doc tweak

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 97d29d7..0afc24a 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ Set up the config files as follows:
 ```
 mkdir config
 cp waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env
-cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/settings.env
+cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/exporter.env
 cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env
 ```
 From the new config files, remove the comments telling you not to put secrets in it, as instructed.

From eb2c16a75d0610da63f3f4b7d5081da71d62f489 Mon Sep 17 00:00:00 2001
From: Jeremy Stein <j.stein@ucl.ac.uk>
Date: Fri, 30 Jan 2026 15:21:23 +0000
Subject: [PATCH 6/6] Document how to config the hasher

---
 README.md                         |  6 ++--
 config.EXAMPLE/hasher.env.EXAMPLE |  6 ++++
 docs/azure_hashing.md             | 48 +++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 2 deletions(-)
 create mode 100644 docs/azure_hashing.md

diff --git a/README.md b/README.md
index 0afc24a..a7df500 100644
--- a/README.md
+++ b/README.md
@@ -63,9 +63,11 @@ cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env
 ```
 From the new config files, remove the comments telling you not to put secrets in it, as instructed.
 
-#### fill in config files
+#### Fill in config files
 Fill out the config, as appropriate.
 
+See [azure and hasher setup](docs/azure_hashing.md) to configure the hasher.
+
 When updating to a new version of this code, you should diff the .EXAMPLE file against its live version,
 eg. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`.
 
@@ -86,7 +88,7 @@ mkdir waveform-export
 
 #### run it!
 
-Build and start the controller and exporter with docker
+Build and start the hasher, controller and exporter with docker.
 ```
 cd waveform-controller
 docker compose build
diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE
index 0631345..efeae85 100644
--- a/config.EXAMPLE/hasher.env.EXAMPLE
+++ b/config.EXAMPLE/hasher.env.EXAMPLE
@@ -1,9 +1,15 @@
 # This is an EXAMPLE file, do not put real secrets in here.
 # Copy it to ../config/hasher.env and then DELETE THIS COMMENT.
+# Details for the Azure service principal, so it can log in to the keyvault.
+# aka "appId"
 AZURE_CLIENT_ID=
+# aka "password"
 AZURE_CLIENT_SECRET=
+# aka "tenant"
 AZURE_TENANT_ID=
+# the name of the key vault, NOT the service principal
 AZURE_KEY_VAULT_NAME=
+
 # This is the "variable name" of the actual secret,
 # and can be fixed and is not secret itself
 AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" # pragma: allowlist secret
diff --git a/docs/azure_hashing.md b/docs/azure_hashing.md
new file mode 100644
index 0000000..43898fb
--- /dev/null
+++ b/docs/azure_hashing.md
@@ -0,0 +1,48 @@
+# Setting up Azure + Hashing in Dev and Production
+
+# Create and configure Azure key vaults
+
+Azure key vaults for dev and prod already exist. Ask your team-mates how to find the details for these.
+
+For each, there is an Azure service principal (aka. machine account) that can read/write secrets to
+the key vault.
+
+# Configure hasher
+The hasher needs to be given the service principal details so it can create/obtain the
+secrets. It also needs to know the name of the manually-created secret (see next section for more details).
+
+See [hasher example config](../config.EXAMPLE/hasher.env.EXAMPLE) for a detailed description of required env vars.
+
+# Manual Azure config
+
+There is a one-off (per key vault) step that needs to be performed manually.
+
+First, install the Azure CLI tools in the usual way for your OS.
+
+Log in using the service principal.
+Do not include the password on the command line; let it prompt you and then paste it in.
+```
+az login --service-principal --username <APP_ID> --tenant <TENANT_ID>
+```
+
+Now you can run commands to inspect the existing setup:
+```
+# show all keyvaults
+az keyvault list
+
+# Show keyvault details (not secrets). name is "name" key from previous command
+az keyvault show --name <keyvault_name>
+
+# list all secrets in keyvault
+az keyvault secret list --vault-name <keyvault_name>
+```
+As per [PIXL instructions](https://github.com/SAFEHR-data/PIXL/blob/main/docs/setup/azure-keyvault.md#step-4),
+you need to manually create a secret project-level key:
+```
+az keyvault secret set --vault-name <keyvault_name> --name <secret_name> --value <secret_value>
+```
+Note that you can choose the name of this secret (`<secret_name>` above), and its name (NOT its value)
+should be placed in the config env var `AZURE_KEY_VAULT_SECRET_NAME`.
+
+In addition, the PIXL hasher automatically creates a secret named after the "project slug" that you pass
+in, the first time that you request a hash using that project slug.