SAFEHR-data · jeremyestein · Jan 23, 2026 · Jan 23, 2026 · Jan 23, 2026 · Jan 23, 2026
diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,6 @@ wheels/
 
 # settings files (should not be in the source tree anyway, but just in case)
 *.env
+
+# snakemake tracking files
+.snakemake
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -33,7 +33,8 @@ repos:
           [
             "pandas-stubs",
             "types-psycopg2",
-            "types-pika"
+            "types-pika",
+            "types-requests",
           ]
         files: src/
   # a collection of sanity checks: check for merge conflicts, check the end of

diff --git a/README.md b/README.md
@@ -48,18 +48,36 @@ separate to the Emap project root.
 
 ### Instructions for achieving this structure
 
+
+#### Clone repos
 Clone this repo (`waveform-controller`) and [PIXL](https://github.com/SAFEHR-data/PIXL),
 both inside your root directory.
 
+#### Make config files
 Set up the config files as follows:
 ```
 mkdir config
 cp waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env
-cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/settings.env
+cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/exporter.env
 cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env
 ```
 From the new config files, remove the comments telling you not to put secrets in it, as instructed.
 
+#### Fill in config files
+Fill out the config, as appropriate.
+
+See [azure and hasher setup](docs/azure_hashing.md) to configure the hasher.
+
+When updating to a new version of this code, you should diff the .EXAMPLE file against its live version,
+e.g. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`.
+
+This checks if any config options have been added/removed from the .EXAMPLE, and thus should be
+added/removed from the live file.
+
+> [!CAUTION]
+> Be careful not to copy sensitive data from the live config file to the .EXAMPLE file!
+
+#### Make necessary directories
 If it doesn't already exist you should create a directory named
 `waveform-export` in the parent directory to store the saved waveform
 messages.
@@ -68,7 +86,9 @@ messages.
 mkdir waveform-export
 ```
 
-Build and start the controller and exporter with docker
+#### Run it!
+
+Build and start the hasher, controller and exporter with docker.
 ```
 cd waveform-controller
 docker compose build

diff --git a/config.EXAMPLE/exporter.env.EXAMPLE b/config.EXAMPLE/exporter.env.EXAMPLE
@@ -2,7 +2,13 @@
 # Copy it to ../config/exporter.env and then DELETE THIS COMMENT.
 # When does the exporter run
 EXPORTER_CRON_SCHEDULE="14 5 * * *"
+# Where to upload via FTPS
 FTPS_HOST=myftps.example.com
 FTPS_PORT=990
 FTPS_USERNAME=
 FTPS_PASSWORD=
+# only run workflow up to and including the specified rule
+SNAKEMAKE_RULE_UNTIL=
+# point to the hasher we wish to use
+HASHER_API_HOSTNAME=waveform-hasher
+HASHER_API_PORT=8000
diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE
@@ -1,6 +1,15 @@
 # This is an EXAMPLE file, do not put real secrets in here.
 # Copy it to ../config/hasher.env and then DELETE THIS COMMENT.
-HASHER_API_AZ_CLIENT_ID=
-HASHER_API_AZ_CLIENT_PASSWORD=
-HASHER_API_AZ_TENANT_ID=
-HASHER_API_AZ_KEY_VAULT_NAME=
+# Details for the Azure service principal, so it can log in to the keyvault.
+# aka "appId"
+AZURE_CLIENT_ID=
+# aka "password"
+AZURE_CLIENT_SECRET=
+# aka "tenant"
+AZURE_TENANT_ID=
+# the name of the key vault, NOT the service principal
+AZURE_KEY_VAULT_NAME=
+
+# This is the "variable name" of the actual secret,
+# and can be fixed and is not secret itself
+AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" # pragma: allowlist secret
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -47,7 +47,8 @@ services:
         HTTPS_PROXY: ${HTTPS_PROXY}
         https_proxy: ${https_proxy}
     ports:
-      - "127.0.0.1:${HASHER_API_PORT}:8000"
+      # this is only here as a convenience for testing, we don't actually use it from the exporter
+      - "127.0.0.1::8000"
     env_file:
       - ../config/hasher.env
     restart: unless-stopped
diff --git a/docs/azure_hashing.md b/docs/azure_hashing.md
@@ -0,0 +1,48 @@
+# Setting up Azure + Hashing in Dev and Production
+
+# Create and configure Azure key vaults
+
+Azure key vaults for dev and prod already exist. Ask your team-mates how to find the details for these.
+
+For each, there is an Azure service principal (aka. machine account) that can read/write secrets to
+the key vault.
+
+# Configure hasher
+The hasher needs to be given the service principal details so it can create/obtain the
+secrets. It also needs to know the name of the manually-created secret (see next section for more details).
+
+See [hasher example config](../config.EXAMPLE/hasher.env.EXAMPLE) for a detailed description of the required env vars.
+
+# Manual Azure config
+
+There is a one-off (per key vault) step that needs to be performed manually.
+
+First, install the Azure CLI tools in the usual way for your OS.
+
+Log in using the service principal.
+Do not include the password on the command line; let it prompt you and then paste it in.
+```
+az login --service-principal --username <APP_ID> --tenant <TENANT_ID>
+```
+
+Now you can run commands to inspect the existing setup:
+```
+# show all keyvaults
+az keyvault list
+
+# Show keyvault details (not secrets). name is "name" key from previous command
+az keyvault show --name <keyvault_name>
+
+# list all secrets in keyvault
+az keyvault secret list --vault-name <keyvault_name>
+```
+As per [PIXL instructions](https://github.com/SAFEHR-data/PIXL/blob/main/docs/setup/azure-keyvault.md#step-4),
+you need to manually create a project-level secret key:
+```
+az keyvault secret set --vault-name <keyvault_name> --name <secret_name> --value <secret_value>
+```
+Note that you can choose the name of this secret (`<secret_name>` above). Its name (NOT its value)
+should be placed in the config env var `AZURE_KEY_VAULT_SECRET_NAME`.
+
+In addition, the PIXL hasher automatically creates a secret named after the "project slug" that you pass
+in, the first time that you request a hash using that project slug.
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,7 +12,7 @@ dependencies = [
     "snakemake==9.14.5",
     # need to be compatible with PIXL, which currently pins 2.9.10 (arguably it shouldn't)
     "psycopg2-binary>=2.9.10",
-    "stablehash==0.3.0",
+    "requests==2.32.3",
     # trick for making a "relative" path, works inside or outside container image
     "core @ file:///${PROJECT_ROOT}/../PIXL/pixl_core",
 ]

diff --git a/src/pseudon/hashing.py b/src/pseudon/hashing.py
@@ -1,17 +1,36 @@
+import logging
 from functools import lru_cache
-from stablehash import stablehash
 
+import requests
 
-@lru_cache
+import settings
+
+logger = logging.getLogger(__name__)
+
+
+@lru_cache(maxsize=1000)
 def do_hash(type_prefix: str, value: str):
-    """Stub implementation of deidentification function for testing purposes.
+    """Pass data to the hasher API for de-identification purposes.
 
     Not that I think this will happen in practice, but we'd want the CSN "1234" to hash
     to a different value than the MRN "1234", so prefix each value with its type.
     """
-    # Full implementation of issue #6 must remove this code and call the real hasher!!
-    SALT = "waveform-exporter"
-    full_value_to_hash = f"{SALT}:{type_prefix}:{value}"
-    full_hash = stablehash(full_value_to_hash).hexdigest()
-    tiny_hash = full_hash[:8]
-    return tiny_hash
+
+    project_slug = "waveform-exporter"
+    full_value_to_hash = f"{type_prefix}:{value}"
+
+    hasher_hostname = settings.HASHER_API_HOSTNAME
+    hasher_port = settings.HASHER_API_PORT
+    hasher_req_url = f"http://{hasher_hostname}:{hasher_port}/hash"
+    request_params: dict[str, str | int] = {
+        "project_slug": project_slug,
+        "message": full_value_to_hash,
+    }
+    # do we need to specify a particular hash length?
+    # request_params["length"] = hash_len
+
+    response = requests.get(hasher_req_url, params=request_params)
+    logger.debug("RESPONSE = {}", response.text)
+    response.raise_for_status()
+    real_hash = response.text
+    return real_hash
diff --git a/src/settings.py b/src/settings.py
@@ -29,3 +29,6 @@ def get_from_env(env_var, *, default_value=None, setting_name=None):
 get_from_env("FTPS_PORT", default_value=990)
 get_from_env("FTPS_USERNAME")
 get_from_env("FTPS_PASSWORD")
+
+get_from_env("HASHER_API_HOSTNAME")
+get_from_env("HASHER_API_PORT")