diff --git a/.gitignore b/.gitignore index be3f5ce..a44b7dd 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,6 @@ wheels/ # settings files (should not be in the source tree anyway, but just in case) *.env + +# snakemake tracking files +.snakemake diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d4b0fd9..a217d81 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,8 @@ repos: [ "pandas-stubs", "types-psycopg2", - "types-pika" + "types-pika", + "types-requests", ] files: src/ # a collection of sanity checks: check for merge conflicts, check the end of diff --git a/README.md b/README.md index 4743482..a7df500 100644 --- a/README.md +++ b/README.md @@ -48,18 +48,36 @@ separate to the Emap project root. ### Instructions for achieving this structure + +#### Clone repos Clone this repo (`waveform-controller`) and [PIXL](https://github.com/SAFEHR-data/PIXL), both inside your root directory. +#### make config files Set up the config files as follows: ``` mkdir config cp waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env -cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/settings.env +cp waveform-controller/config.EXAMPLE/exporter.env.EXAMPLE config/exporter.env cp waveform-controller/config.EXAMPLE/hasher.env.EXAMPLE config/hasher.env ``` From the new config files, remove the comments telling you not to put secrets in it, as instructed. +#### Fill in config files +Fill out the config, as appropriate. + +See [azure and hasher setup](docs/azure_hashing.md) to configure the hasher. + +When updating to a new version of this code, you should diff the .EXAMPLE file against its live version, +eg. by running `vimdiff waveform-controller/config.EXAMPLE/controller.env.EXAMPLE config/controller.env`. + +This checks if any config options have been added/removed from the .EXAMPLE, and thus should be +added/removed from the live file. 
+ +> [!CAUTION] +> Be careful not to copy sensitive data from the live config file to the .EXAMPLE file! + +#### make necessary directories If it doesn't already exist you should create a directory named `waveform-export` in the parent directory to store the saved waveform messages. @@ -68,7 +86,9 @@ messages. mkdir waveform-export ``` -Build and start the controller and exporter with docker +#### run it! + +Build and start the hasher, controller and exporter with docker. ``` cd waveform-controller docker compose build diff --git a/config.EXAMPLE/exporter.env.EXAMPLE b/config.EXAMPLE/exporter.env.EXAMPLE index 7be714c..134ac38 100644 --- a/config.EXAMPLE/exporter.env.EXAMPLE +++ b/config.EXAMPLE/exporter.env.EXAMPLE @@ -2,7 +2,13 @@ # Copy it to ../config/exporter.env and then DELETE THIS COMMENT. # When does the exporter run EXPORTER_CRON_SCHEDULE="14 5 * * *" +# Where to upload via FTPS FTPS_HOST=myftps.example.com FTPS_PORT=990 FTPS_USERNAME= FTPS_PASSWORD= +# only run workflow up to and including the specified rule +SNAKEMAKE_RULE_UNTIL= +# point to the hasher we wish to use +HASHER_API_HOSTNAME=waveform-hasher +HASHER_API_PORT=8000 diff --git a/config.EXAMPLE/hasher.env.EXAMPLE b/config.EXAMPLE/hasher.env.EXAMPLE index 1cd6ad0..efeae85 100644 --- a/config.EXAMPLE/hasher.env.EXAMPLE +++ b/config.EXAMPLE/hasher.env.EXAMPLE @@ -1,6 +1,15 @@ # This is an EXAMPLE file, do not put real secrets in here. # Copy it to ../config/hasher.env and then DELETE THIS COMMENT. -HASHER_API_AZ_CLIENT_ID= -HASHER_API_AZ_CLIENT_PASSWORD= -HASHER_API_AZ_TENANT_ID= -HASHER_API_AZ_KEY_VAULT_NAME= +# Details for the Azure service principal, so it can log in to the keyvault. 
+# aka "appId" +AZURE_CLIENT_ID= +# aka "password" +AZURE_CLIENT_SECRET= +# aka "tenant" +AZURE_TENANT_ID= +# the name of the key vault, NOT the service principal +AZURE_KEY_VAULT_NAME= + +# This is the "variable name" of the actual secret, +# and can be fixed and is not secret itself +AZURE_KEY_VAULT_SECRET_NAME="waveform-secret-key" # pragma: allowlist secret diff --git a/docker-compose.yml b/docker-compose.yml index d20153a..6c30ca7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -47,7 +47,8 @@ services: HTTPS_PROXY: ${HTTPS_PROXY} https_proxy: ${https_proxy} ports: - - "127.0.0.1:${HASHER_API_PORT}:8000" + # this is only here as a convenience for testing, we don't actually use it from the exporter + - "127.0.0.1::8000" env_file: - ../config/hasher.env restart: unless-stopped diff --git a/docs/azure_hashing.md b/docs/azure_hashing.md new file mode 100644 index 0000000..43898fb --- /dev/null +++ b/docs/azure_hashing.md @@ -0,0 +1,48 @@ +# Setting up Azure + Hashing in Dev and Production + +# Create and configure Azure key vaults + +Azure key vaults for dev and prod already exist. Ask your team-mates how to find the details for these. + +For each, there is an Azure service principal (aka. machine account) that can read/write secrets to +the key vault. + +# Configure hasher +The hasher needs to be given the service principal details so it can create/obtain the +secrets. It also needs to know the name of the manually-created secret (see next section for more details). + +See [hasher example config](../config.EXAMPLE/hasher.env.EXAMPLE) for a detailed description of required env vars. + +# Manual Azure config + +There is a one-off (per key vault) step that needs to be performed manually. + +First, install the Azure CLI tools in the usual way for your OS. + +Log in using the service principal. +Do not include password on command line; let it prompt you and then paste it in. 
# Upper bound (in seconds) on a single hasher API call, so that a stalled or
# unreachable hasher cannot block the caller indefinitely.
_HASHER_TIMEOUT_SECONDS = 30


@lru_cache(maxsize=1000)
def do_hash(type_prefix: str, value: str) -> str:
    """Pass data to the hasher API for de-identification purposes.

    The value sent to the hasher is ``"{type_prefix}:{value}"``. Prefixing
    each value with its type means that, for example, the CSN "1234" hashes
    to a different value than the MRN "1234" (unlikely to matter in practice,
    but cheap to guarantee).

    Successful results are memoised per ``(type_prefix, value)`` pair (up to
    1000 entries), so repeated identifiers do not cause repeated HTTP calls.

    Args:
        type_prefix: Label for the kind of identifier being hashed.
        value: The identifier to de-identify.

    Returns:
        The body of the hasher's HTTP response, as text.

    Raises:
        requests.exceptions.HTTPError: The hasher replied with an error status.
        requests.exceptions.RequestException: The request could not be
            completed (connection failure, timeout, ...).
    """
    project_slug = "waveform-exporter"
    full_value_to_hash = f"{type_prefix}:{value}"

    hasher_hostname = settings.HASHER_API_HOSTNAME
    hasher_port = settings.HASHER_API_PORT
    hasher_req_url = f"http://{hasher_hostname}:{hasher_port}/hash"
    request_params: dict[str, str | int] = {
        "project_slug": project_slug,
        "message": full_value_to_hash,
    }
    # TODO: decide whether a particular hash length needs to be requested,
    # e.g. via a "length" entry in request_params.

    response = requests.get(
        hasher_req_url,
        params=request_params,
        timeout=_HASHER_TIMEOUT_SECONDS,
    )
    # stdlib logging interpolates with %-style placeholders; a "{}" placeholder
    # with a positional argument makes the handler report a formatting error
    # instead of emitting the message.
    # Logged before the status check so the body of an error reply is visible.
    logger.debug("RESPONSE = %s", response.text)
    response.raise_for_status()
    return response.text