# The default style for rendering JSON parsed as Python dicts isn't the best.
# Use this import and call `print(json)` when we want a cleaner view.
from rich import print
# Status bars for long-running cells
from tqdm.notebook import trange, tqdm
FHIR Bulk Data Technical Example
This module covers the process of connecting to a FHIR server, using its Bulk Data API to request an export, and converting the returned FHIR-formatted data into a tabular data structure in Python.
1 Background
The Bulk Data Access standard enables researchers to retrieve large volumes of data from a patient population in an EHR. The Bulk Data Access standard is part of the SMART ecosystem, and SMART on FHIR can be used to authenticate and authorize applications that retrieve bulk data automatically.
Clients of FHIR Bulk Data servers use SMART Backend Authorization to connect to the server. With SMART Backend Authorization, registered clients make a signed request to a token endpoint to receive a Bearer token, which they use for subsequent calls to the FHIR server.
Client registration often happens manually as a separate one-time event. The SMART Backend Authorization specification does not define an API for registration.
In this example, we connect to the SMART Bulk Data Server (https://bulk-data.smarthealthit.org). This is a developer tool provided by SMART Health IT to facilitate development with Bulk Data Access. This test server allows clients to “register” on the launch page by providing either a URL for a JSON Web Key Set (JWKS) or a raw JWKS. In this case, “registration” is not stored on the server. Instead, the FHIR Server URL contains the “registration” information stored as state in the URL and client ID. Production servers will usually have a more standard registration process rather than taking this approach.
For convenience, the SMART Bulk Data Server launch page allows users to generate a one-off JWKS to use for testing. For production usage, clients must generate their own certificates and JWKS and keep the private key private. For convenience, here we use a JWKS generated by the launch page.
IMPORTANT: this module is not meant to be formal documentation for the FHIR Bulk Data specification, and it largely skips error handling and stays on the “happy path” for brevity and readability. We strongly recommend reviewing the specifications and adding error handling before using any of this code in a production environment.
2 Getting an Access Token
The first step in obtaining data from a FHIR server that supports Bulk Data Access is to obtain an access token. That access token identifies and authorizes the client on requests made to the FHIR resource server.
Obtaining an access token is itself a two-step process:
- Make a discovery request to the FHIR resource server to get the address of the authorization server.
- Post a token request, signed by the client’s private key, to the authorization server.
To keep the focus of this module on the Bulk Data process rather than the details of generating keys, we will use a JWKS pre-generated by the SMART Bulk Data example server launch page.
For reference, we followed the steps below to generate the keys used in this exercise:
- Visit the SMART Bulk Data Server launch page
- In the upper left, click the JWKS button for Authentication
- Click the Generate button and choose Generate RS384
- Choose R4 for the FHIR Version
- The associated text box now contains a JWKS with both a public and private key, and the Launch Configuration contains a FHIR Server URL and Client ID
- Convert the private key from the JWKS to “PEM” format so it can be used by Python (this is not easy to do natively in Python, so we have done it with JavaScript out of band)
Let’s start by defining our credentials. In practice, real credentials must always be stored and loaded securely, but for simplicity in this module we will define them as local variables below.
client_id = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6InJlZ2lzdHJhdGlvbi10b2tlbiJ9.eyJqd2tzIjp7ImtleXMiOlt7Imt0eSI6IlJTQSIsImFsZyI6IlJTMzg0IiwibiI6IngzMDc2RTJNaUpMR3JPbXJXRjZXSWZ1RjFSZDBlTjBSdEhUSVRuMlNGVWhMYTFQWE5Ia0xBR2xSSmtJWk1QMUk5SEhxdTRERy02d2JraFMweU9GbEZhZE1iaGgzcHkySHoybDctRmg1M3Y3bmpwb3dxUGV2eEpqMlpEQU5BanFWeHRLOGdvMm1BZmZFSnJ2ZkVHbm5oUGkzdGE1U2U5UTBkS29la2hJRVRCaVJTa0ozN0pobEZGSDh3S2hFLXVwaXBQU3VycTBrQ0JkNlNaS3NOVHpHNzJmLVJoNENiREZWTVdfRm5zcTh5LWRJMTdMSDJZcHBBLWc0eGlUZnMwMGZOUG9FUEdoWFU2bHFKMHMwclp4Um9zYnVuV0NTYi1UaEtWV0RyeUFudE83S3dWN1BxVG1NMmVrVS1yenZFaWprVjZfUUlnVTJxRTd6X1k1N1l4aW8zUSIsImUiOiJBUUFCIiwia2V5X29wcyI6WyJ2ZXJpZnkiXSwiZXh0Ijp0cnVlLCJraWQiOiI0ZDc3OTJjZTQyMDU0ZDVkZjhkZDg1ZjhiNTI3ZGQ4OCJ9LHsia3R5IjoiUlNBIiwiYWxnIjoiUlMzODQiLCJuIjoieDMwNzZFMk1pSkxHck9tcldGNldJZnVGMVJkMGVOMFJ0SFRJVG4yU0ZVaExhMVBYTkhrTEFHbFJKa0laTVAxSTlISHF1NERHLTZ3YmtoUzB5T0ZsRmFkTWJoaDNweTJIejJsNy1GaDUzdjduanBvd3FQZXZ4SmoyWkRBTkFqcVZ4dEs4Z28ybUFmZkVKcnZmRUdubmhQaTN0YTVTZTlRMGRLb2VraElFVEJpUlNrSjM3SmhsRkZIOHdLaEUtdXBpcFBTdXJxMGtDQmQ2U1pLc05Uekc3MmYtUmg0Q2JERlZNV19GbnNxOHktZEkxN0xIMllwcEEtZzR4aVRmczAwZk5Qb0VQR2hYVTZscUowczByWnhSb3NidW5XQ1NiLVRoS1ZXRHJ5QW50TzdLd1Y3UHFUbU0yZWtVLXJ6dkVpamtWNl9RSWdVMnFFN3pfWTU3WXhpbzNRIiwiZSI6IkFRQUIiLCJkIjoiUnptQWRTMlMtb1FsS1VGNHF1R0Npdm1KekE1R3lJeHRzTmR0V1JEZVluamdiSjZQbksyRzd3dXJMSlMyOTlYSEFYZld6a0ZwU2h3bDc5OHl1UEk0ckNXQ1ZXQ29fLWh5ci14Q2xlWEpCWVJQV292VXljODlVMTBsdzVtZ1cyWmRhWkotT2NLblBkYWZreERLME1wdkhmdkxZN09zd1lkX2Z4UHFQRTd3ZDlaQU5XLUIyWmNURUVmd2taNWdlcmtDdnFHQ1lEUTdVcVJqR3k1dWRjTkRiQ01ITFdGaEZZMTVqMDVMMFpJV0RwUDY2cmN6UWZEdnduR0pIbWxJbnJMbTl5WkowUTNkVlpHSmo2Y2dMeWI4WHhkNHpWRjZGSy1NX2VKbnFzZFRveHRPMDNUOVotSWlrN1BfbFBheWRvMWRycXRZdUxmZXpvU1lnUGp0V0NnV0JRIiwicCI6IjZwNlV5aGZiQ0JjQlEzcGttMHZEb1lqSDZsc1FCeS1PTzlEYlpfZnFfSHpzZl96UWhENDdua0dZZngxbGVTUFlQU0ZSeDlRTUR3cTlvYWxjYmEwNmE3QTVmMUxQNVpaRnNvSDVCTElHTUcxNmhDbW1mTEdRMURkZ3pMb2s3Q3RldDRnNGhUTlpseFZOYV9uYVNmZGJSdmQycF8zNTM1RGpaOXoyMEpSNllDYyIsInEiOiIyYXNhQ0RCTmY3NTQ1ajdOcXI2TTZiUW8wVGZEWGNlb2FxcGVtNG
hpNE1pYUtBOEcydVFvdXNTOGcyUTlZOFZiZmxjX3I2WmxPVjIxSmJhYW5WN253MDRxbVpqMG5Xdkk0a19yX2lKWTVuSDNUMHk0Y0lGV21tLUhPY1dzazJXWl9QQ1NSc1piOU1qOUs4UXh6b1h5WEo0ck9aLUw4OTNZbDZ5bVdKa2xqVnMiLCJkcCI6Ik9LeWI5b0Z5dUc2T01KV2xMZHBNWkgzZEJPQ0FhNnZ5S01MWDdUSjNBZ3pQT0UtQ3N4OHhXWll3MXl2cnNpcVZkcGJRNFh0NGVqMjI5eEVwTVpreHpvZWdMQUItRmRDSl80Zmo5bDFtbjFZaXpVQWVabXFpT0pFMEFlQkpRUDlzX3RxYUJKc1YzaWdZTHFnSk1lcmRrclAtWnJBMEp1d2g4cG51eVEzRXplcyIsImRxIjoib2I2R0FvMjZHUEcxcnduLUZDR3lYanMwbFhzRlhwdHRaNDJmN1owa05IcDhLc1kzeHRJQl9mOFJRZVZyeE1hem5TZENPTWpCc1NZVDVLbFRMUnVIeHRZX3k1RWdQQllLMlRpZ1dXQzJoTTh0QWEwMTVNd0hTWTBVZ19hQ3JhaXpDNFRNZlhFS2hkUVFaTVJPYW5PWVRBQndpRW9wV2hhQXl2eE5ROHJSWDc4IiwicWkiOiJLSjhJU0RKaHVyUmEyTVRHdG4zWjR3NU9ob3o2N29OcE10MG1TakxGUEt0QjFWbjRaZ3VkTUxfWTZ4V2lWTnBOR1hQa3hoMEJjRmNKakNKcC0yeUZLV0d4Si14M2JMWVllbkVUaGRFSGRRR0xuUUszMHlEdHFTY2NDUVY5U2xGc281NUdnUmxhODNaY2NBZTdBMXBWN2sxRGE4dFVFNkE4TXNlQ1ZXamRLbFUiLCJrZXlfb3BzIjpbInNpZ24iXSwiZXh0Ijp0cnVlLCJraWQiOiI0ZDc3OTJjZTQyMDU0ZDVkZjhkZDg1ZjhiNTI3ZGQ4OCJ9XX0sImFjY2Vzc1Rva2Vuc0V4cGlyZUluIjoxNSwiaWF0IjoxNjg2NjUyNzM4fQ.j1urst068-21CxiH0Nqml7XoE9v6hWJ_vfqAK4W22vg'
# Don't worry! This is not anybody's real private key. It was
# generated specifically and only for this exercise.
private_key = """-----BEGIN RSA PRIVATE KEY-----
MIIEowIBAAKCAQEAx3076E2MiJLGrOmrWF6WIfuF1Rd0eN0RtHTITn2SFUhLa1PX
NHkLAGlRJkIZMP1I9HHqu4DG+6wbkhS0yOFlFadMbhh3py2Hz2l7+Fh53v7njpow
qPevxJj2ZDANAjqVxtK8go2mAffEJrvfEGnnhPi3ta5Se9Q0dKoekhIETBiRSkJ3
7JhlFFH8wKhE+upipPSurq0kCBd6SZKsNTzG72f+Rh4CbDFVMW/Fnsq8y+dI17LH
2YppA+g4xiTfs00fNPoEPGhXU6lqJ0s0rZxRosbunWCSb+ThKVWDryAntO7KwV7P
qTmM2ekU+rzvEijkV6/QIgU2qE7z/Y57Yxio3QIDAQABAoIBAEc5gHUtkvqEJSlB
eKrhgor5icwORsiMbbDXbVkQ3mJ44Gyej5ythu8LqyyUtvfVxwF31s5BaUocJe/f
MrjyOKwlglVgqP/ocq/sQpXlyQWET1qL1MnPPVNdJcOZoFtmXWmSfjnCpz3Wn5MQ
ytDKbx37y2OzrMGHf38T6jxO8HfWQDVvgdmXExBH8JGeYHq5Ar6hgmA0O1KkYxsu
bnXDQ2wjBy1hYRWNeY9OS9GSFg6T+uq3M0Hw78JxiR5pSJ6y5vcmSdEN3VWRiY+n
IC8m/F8XeM1RehSvjP3iZ6rHU6MbTtN0/WfiIpOz/5T2snaNXa6rWLi33s6EmID4
7VgoFgUCgYEA6p6UyhfbCBcBQ3pkm0vDoYjH6lsQBy+OO9DbZ/fq/Hzsf/zQhD47
nkGYfx1leSPYPSFRx9QMDwq9oalcba06a7A5f1LP5ZZFsoH5BLIGMG16hCmmfLGQ
1DdgzLok7Ctet4g4hTNZlxVNa/naSfdbRvd2p/3535DjZ9z20JR6YCcCgYEA2asa
CDBNf7545j7Nqr6M6bQo0TfDXceoaqpem4hi4MiaKA8G2uQousS8g2Q9Y8Vbflc/
r6ZlOV21JbaanV7nw04qmZj0nWvI4k/r/iJY5nH3T0y4cIFWmm+HOcWsk2WZ/PCS
RsZb9Mj9K8QxzoXyXJ4rOZ+L893Yl6ymWJkljVsCgYA4rJv2gXK4bo4wlaUt2kxk
fd0E4IBrq/IowtftMncCDM84T4KzHzFZljDXK+uyKpV2ltDhe3h6Pbb3ESkxmTHO
h6AsAH4V0In/h+P2XWafViLNQB5maqI4kTQB4ElA/2z+2poEmxXeKBguqAkx6t2S
s/5msDQm7CHyme7JDcTN6wKBgQChvoYCjboY8bWvCf4UIbJeOzSVewVem21njZ/t
nSQ0enwqxjfG0gH9/xFB5WvExrOdJ0I4yMGxJhPkqVMtG4fG1j/LkSA8FgrZOKBZ
YLaEzy0BrTXkzAdJjRSD9oKtqLMLhMx9cQqF1BBkxE5qc5hMAHCISilaFoDK/E1D
ytFfvwKBgCifCEgyYbq0WtjExrZ92eMOToaM+u6DaTLdJkoyxTyrQdVZ+GYLnTC/
2OsVolTaTRlz5MYdAXBXCYwiaftshSlhsSfsd2y2GHpxE4XRB3UBi50Ct9Mg7akn
HAkFfUpRbKOeRoEZWvN2XHAHuwNaVe5NQ2vLVBOgPDLHglVo3SpV
-----END RSA PRIVATE KEY-----"""
# note key id is the "kid" field from the JWKS -- it's same
# for both values of `keys`
key_id = "4d7792ce42054d5df8dd85f8b527dd88"
server_url = 'https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir'
We will use the Requests library for making all HTTP requests, and use a Session, in case we need to persist common settings such as proxy or SSL configuration.
import requests
session = requests.Session()
# Optional: Turn off SSL verification. Useful when dealing with a
# corporate proxy with self-signed certificates.
#
# from urllib3.exceptions import InsecureRequestWarning
# requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
# session.verify = False
Let’s start by confirming we can hit the server via the /metadata endpoint. When connecting to a server for the first time it is generally a good idea to review the metadata to see what the server supports, and that it matches your expectations. In this case, expect to see the name “SMART Sample Bulk Data Server”, and references to “export” operations.
r = session.get(f'{server_url}/metadata')
metadata = r.json()
You can call print(metadata) to print out the raw JSON, but this can be unwieldy to navigate. If you are working in a Jupyter notebook or similar environment, the following provides a collapsible view into the JSON:
from IPython.display import JSON
JSON(metadata)
The SMART Backend Authorization specification defines that the token endpoint will be published as part of the FHIR resource server’s SMART metadata, at .well-known/smart-configuration. Let’s fetch that endpoint and review the contents.
r = session.get(f'{server_url}/.well-known/smart-configuration')
smart_config = r.json()
JSON(smart_config)
We care most about the token_endpoint field, which we need in order to request our access token. For more information about the other fields, see here.
token_endpoint = smart_config['token_endpoint']
Now we have our token endpoint, so we can make a request to it to get a token. The request follows the OAuth 2.0 “Client Credentials” flow, using a JSON Web Token (JWT) assertion containing our client ID and signed with our private key.
📘 Read more about the access token request specification
# Create a JWT client assertion as follows:
import datetime
import uuid

import jwt

assertion = jwt.encode(
    {
        'iss': client_id,  # "iss" == "issuer", the client that created this JWT
        'sub': client_id,  # "sub" == "subject", the client that will use the access token
        'aud': token_endpoint,  # "aud" == "audience", the receiver of this request
        # "exp" == "expiration", in seconds since the epoch; SMART Backend
        # Services allows at most five minutes in the future
        'exp': int((datetime.datetime.now() + datetime.timedelta(minutes=5)).timestamp()),
        # "jti" == "JWT ID", a unique nonce for this assertion; the SMART
        # Backend Services specification lists it as a required claim
        'jti': str(uuid.uuid4()),
    },
    private_key,  # signed with the private key
    algorithm='RS384',  # algorithm for the key
    headers={"kid": key_id},  # kid is required for smart bulk data server
)

# And then POST it to the token endpoint
r = session.post(token_endpoint, data={
    'scope': 'system/*.read',
    'grant_type': 'client_credentials',
    'client_assertion_type': 'urn:ietf:params:oauth:client-assertion-type:jwt-bearer',
    'client_assertion': assertion,
})
# Fail with a clear HTTP error instead of a confusing KeyError further down
# if the server rejected the assertion.
r.raise_for_status()
token_response = r.json()
# And inspect the response:
print(token_response){ 'token_type': 'bearer', 'scope': 'system/*.read', 'expires_in': 300, 'access_token': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoiYmVhcmVyIiwic2NvcGUiOiJzeXN0ZW0vKi5yZWFkIiwiZXhwaXJlc19pb iI6MzAwLCJpYXQiOjE3ODAzMzAxNTcsImV4cCI6MTc4MDMzMDQ1N30.xYKlHqt_X6Kx0Q9kSW0z5QdlTzPJwz_N_QtBmDMev2w' }
Two important fields we need to keep track of are the token itself and the expiration time. Tokens are only valid for a certain amount of time, and once they expire we will need to fetch a new one via the same process as above. expires_in is in seconds from the current time, so we’ll add that to the current time to get a timestamp we can compare against.
Note that for this example we requested and received 'scope': 'system/*.read' which allows access to all resource types. In practice, requesting access to all resource types is generally not recommended, and servers do not always support asking for * scopes. Generally it is recommended to request only the minimal level of access necessary.
token = token_response['access_token']
expire_time = datetime.datetime.now() + datetime.timedelta(seconds=token_response['expires_in'])
To make this easier for ourselves, let’s package this up into a get_token() function that we can call anytime we need to use a token. If the current token is still valid, use that, or if it has expired, fetch a new one. The logic is exactly the same as the previous steps we just ran:
def get_token(leeway_seconds=10):
    """Return a usable bearer token, requesting a new one when needed.

    The module-level ``token`` is reused while it is still valid. Otherwise
    a fresh signed JWT client assertion is posted to ``token_endpoint`` and
    the module-level ``token`` and ``expire_time`` are replaced with the
    values from the response.

    Args:
        leeway_seconds: Refresh the token this many seconds before its
            recorded expiry, so a token does not lapse between this call
            and the request that uses it.

    Returns:
        The access token string to place in a ``Bearer`` Authorization header.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
    """
    import uuid  # local import: only this function needs it

    global token, expire_time

    now = datetime.datetime.now()
    if now < expire_time - datetime.timedelta(seconds=leeway_seconds):
        # the existing token is still valid so return it
        return token

    assertion = jwt.encode(
        {
            'iss': client_id,
            'sub': client_id,
            'aud': token_endpoint,
            'exp': int((now + datetime.timedelta(minutes=5)).timestamp()),
            # unique nonce; a required claim in SMART Backend Services
            'jti': str(uuid.uuid4()),
        },
        private_key,
        algorithm='RS384',
        headers={"kid": key_id},
    )

    r = session.post(token_endpoint, data={
        'scope': 'system/*.read',
        'grant_type': 'client_credentials',
        'client_assertion_type': 'urn:ietf:params:oauth:client-assertion-type:jwt-bearer',
        'client_assertion': assertion,
    })
    # Surface a rejected request as an HTTP error rather than a KeyError below.
    r.raise_for_status()
    token_response = r.json()

    token = token_response['access_token']
    # expires_in is relative to "now", so convert it to an absolute timestamp
    expire_time = datetime.datetime.now() + datetime.timedelta(seconds=token_response['expires_in'])
    return token


# 3 Starting, Checking, and Downloading the Export
Now that we have an access token, the next step in using Bulk Data is to request the export of data, via a “kick-off request”. This is an asynchronous request – once the request is accepted, instead of returning the results directly, the server response will point to a URL where the client can check the status.
There are three levels of export:
- Patient, to obtain resources related to all Patients
- Group, to obtain resources for patients associated with a particular Group
- System, to obtain all resources, whether or not they are associated with a patient
For this exercise we will initially only request Patient-level data, but the general process for Group- and System-level data is exactly the same; there is just a different endpoint to hit, and a different set of data will be returned.
There are also a number of parameters that may be set, but to keep things simple we will only use the _type parameter, to request only Patient and Condition resource types.
📘 Read more about the Bulk Data Kick-off Request
Let’s make the export request and inspect the response headers. For “Patient” level data, the URL we want to hit is {server}/Patient/$export. Our token is used in the “Authorization” header in the format "Bearer {token}".
# Kick off a Patient-level export restricted to Patient and Condition resources.
kickoff_url = f'{server_url}/Patient/$export?_type=Patient,Condition'
kickoff_headers = {
    'Authorization': f'Bearer {get_token()}',
    'Accept': 'application/fhir+json',
    # ask for an asynchronous response: the reply points at a status URL
    # instead of carrying the exported data
    'Prefer': 'respond-async',
}
r = session.get(kickoff_url, headers=kickoff_headers)
# Convert `r.headers` to vanilla `dict` to improve readability of print output
print(dict(r.headers)){ 'Content-Length': '644', 'Content-Location': 'https://bulk-data.smarthealthit.org/fhir/bulkstatus/fdb82c62e1cf08e4f1d459c3eae7e18c', 'Content-Type': 'application/json; charset=utf-8', 'Date': 'Mon, 01 Jun 2026 16:09:17 GMT', 'Etag': 'W/"284-JkYZZvdLlIXC1H0J3WzPf648+wk"', 'Nel': '{"report_to":"heroku-nel","response_headers":["Via"],"max_age":3600,"success_fraction":0.01,"failure_fraction":0.1 }', 'Report-To': '{"group":"heroku-nel","endpoints":[{"url":"https://nel.heroku.com/reports?s=IKA5tbAzYC91XYF%2FAhTyI6yZJ2VVe15xkWxL %2FCev4DM%3D\\u0026sid=67ff5de4-ad2b-4112-9289-cf96be89efed\\u0026ts=1780330157"}],"max_age":3600}', 'Reporting-Endpoints': 'heroku-nel="https://nel.heroku.com/reports?s=IKA5tbAzYC91XYF%2FAhTyI6yZJ2VVe15xkWxL%2FCev4DM%3D&sid=67ff5de4-ad2b- 4112-9289-cf96be89efed&ts=1780330157"', 'Server': 'Heroku', 'Via': '1.1 heroku-router', 'X-Powered-By': 'Express' }
We see the status URL in the Content-Location header, so let’s save that into a variable.
check_url = r.headers['Content-Location']
We can now check the status by getting that URL, and the HTTP status code of the response will indicate the export status.
- Code 200 means the export is complete, and the response body will indicate the location
- Code 202 means the export is still in progress
- Codes in the range 4xx-5xx indicate an error has occurred. 4xx codes generally indicate an error in the request, and 5xx codes generally indicate a server error.
Note that in production environments it is recommended to check the status as infrequently as possible, to minimize the load on the server. In this case we expect the export to complete in just a few seconds so the impact of checking every two seconds is minimal. The server will also include a “Retry-After” header which will give us a hint on how long to wait before trying again. We’ll check that status in a loop, and break out of the loop when we get a complete or error response. We’ll print status each time through the loop, and the response body when the export is complete.
# Now we check the status in a loop
from time import sleep

# Fallback wait, in seconds, used when the server sends no Retry-After header
# or sends one that is not a plain number of seconds (e.g. an HTTP date).
DEFAULT_RETRY_DELAY = 2

while True:
    r = session.get(check_url, headers={'Authorization': f'Bearer {get_token()}', 'Accept': 'application/json'})
    if r.status_code == 200:
        # complete: the body lists where the exported files can be downloaded
        response = r.json()
        print(response)
        break
    elif r.status_code == 202:
        # in progress
        print(r.headers)
        # Retry-After is only a hint and may be missing, so never index it directly
        retry_after = r.headers.get('Retry-After', '').strip()
        delay = int(retry_after) if retry_after.isdecimal() else DEFAULT_RETRY_DELAY
        print(f"Sleeping {delay} seconds before retrying")
        sleep(delay)
    else:
        # error: show the server's message and stop polling
        print(r.text)
        break
{'Content-Length': '0', 'Date': 'Mon, 01 Jun 2026 16:09:17 GMT', 'Nel': '{"report_to":"heroku-nel","response_headers":["Via"],"max_age":3600,"success_fraction":0.01,"failure_fraction":0.1 }', 'Report-To': '{"group":"heroku-nel","endpoints":[{"url":"https://nel.heroku.com/reports?s=IKA5tbAzYC91XYF%2FAhTyI6yZJ2VVe15xkWxL %2FCev4DM%3D\\u0026sid=67ff5de4-ad2b-4112-9289-cf96be89efed\\u0026ts=1780330157"}],"max_age":3600}', 'Reporting-Endpoints': 'heroku-nel="https://nel.heroku.com/reports?s=IKA5tbAzYC91XYF%2FAhTyI6yZJ2VVe15xkWxL%2FCev4DM%3D&sid=67ff5de4-ad2b- 4112-9289-cf96be89efed&ts=1780330157"', 'Retry-After': '1', 'Server': 'Heroku', 'Via': '1.1 heroku-router', 'X-Powered-By': 'Express', 'X-Progress': '0% complete, Please wait...'}
Sleeping 1 seconds before retrying
{ 'transactionTime': '2026-06-01T16:09:17+00:00', 'request': 'https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZW wiOjB9/fhir/Patient/$export?_type=Patient,Condition', 'requiresAccessToken': True, 'outputOrganizedBy': '', 'deleted': [], 'output': [ { 'url': 'https://bulk-data.smarthealthit.org/eyJpZCI6ImZkYjgyYzYyZTFjZjA4ZTRmMWQ0NTljM2VhZTdlMThjIiwic2VjdXJlIjp0cnVlLCJsaW 1pdCI6NjM5LCJvZmZzZXQiOjAsInN0cmF0aWZpZXIiOiJDb25kaXRpb24ifQ/fhir/bulkfiles/2.Condition.ndjson', 'count': 639, 'type': 'Condition' }, { 'url': 'https://bulk-data.smarthealthit.org/eyJpZCI6ImZkYjgyYzYyZTFjZjA4ZTRmMWQ0NTljM2VhZTdlMThjIiwic2VjdXJlIjp0cnVlLCJsaW 1pdCI6MTAwLCJvZmZzZXQiOjAsInN0cmF0aWZpZXIiOiJQYXRpZW50In0/fhir/bulkfiles/4.Patient.ndjson', 'count': 100, 'type': 'Patient' } ], 'error': [] }
We can see that the response points us to one or more NDJSON (Newline Delimited JSON) files per resource type, in the output field of the response.
Note that in this case the volume of data is relatively small, and there is only one entry in the list per resource type, but for large datasets it is possible that there could be multiple files (and therefore multiple entries in this list) per resource type.
Let’s save that list to a variable.
output_files = response['output']
print(output_files)[ { 'url': 'https://bulk-data.smarthealthit.org/eyJpZCI6ImZkYjgyYzYyZTFjZjA4ZTRmMWQ0NTljM2VhZTdlMThjIiwic2VjdXJlIjp0cnVlLCJsaW 1pdCI6NjM5LCJvZmZzZXQiOjAsInN0cmF0aWZpZXIiOiJDb25kaXRpb24ifQ/fhir/bulkfiles/2.Condition.ndjson', 'count': 639, 'type': 'Condition' }, { 'url': 'https://bulk-data.smarthealthit.org/eyJpZCI6ImZkYjgyYzYyZTFjZjA4ZTRmMWQ0NTljM2VhZTdlMThjIiwic2VjdXJlIjp0cnVlLCJsaW 1pdCI6MTAwLCJvZmZzZXQiOjAsInN0cmF0aWZpZXIiOiJQYXRpZW50In0/fhir/bulkfiles/4.Patient.ndjson', 'count': 100, 'type': 'Patient' } ]
Now we can loop through the list and download each one. Each file is an NDJSON, so that means we’ll see one resource per line.
To make each step clear and distinct, we’ll keep a dict of { resourceType: [resources,...]} which we can process later.
Note: for this exercise we are only reading the NDJSON files into a dict in memory, but in practice you may want to save the file locally first in case there are errors in processing, especially if the files are large.
import json

# Maps each resource type to the list of parsed resources of that type.
resources_by_type = {}

for output_file in tqdm(output_files):
    download_url = output_file['url']
    resource_type = output_file['type']

    r = session.get(download_url, headers={'Authorization': f'Bearer {get_token()}',
                                           'Accept': 'application/fhir+ndjson'})

    # A resource type may be spread over several files, so extend any list
    # already collected for it rather than replacing it.
    collected = resources_by_type.setdefault(resource_type, [])

    # NDJSON can't be parsed as a whole, we have to process it line-by-line.
    # Split on '\n' only (JSON strings may legally contain other Unicode line
    # separators) and skip blank lines, so an empty file or a trailing
    # newline does not make json.loads() fail.
    for line in r.text.split('\n'):
        if line.strip():
            collected.append(json.loads(line))
# This is a large amount of JSON data, only uncomment this line if you care to review
# print(resources_by_type)
4 Converting to DataFrames
Finally, let’s convert these into DataFrames.
The quick-and-dirty option is to use the Pandas json_normalize() function to parse a list of dicts into a DataFrame.
📘 Read more about pandas.json_normalize
import pandas as pd

# One DataFrame per resource type. json_normalize() expands nested dicts into
# dotted column names (e.g. "text.status") but leaves nested lists as
# single cells.
resource_dfs = {
    resource_type: pd.json_normalize(resources)
    for resource_type, resources in resources_by_type.items()
}
# Now we can work with them by type:
resource_dfs['Patient']| resourceType | id | extension | identifier | name | telecom | gender | birthDate | address | multipleBirthBoolean | communication | text.status | text.div | maritalStatus.coding | maritalStatus.text | multipleBirthInteger | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Patient | 58c297c4-d684-4677-8024-01131d93835e | [{'url': 'http://hl7.org/fhir/StructureDefinit... | [{'system': 'https://github.com/synthetichealt... | [{'use': 'official', 'family': 'Wintheiser', '... | [{'system': 'phone', 'value': '555-712-4709', ... | female | 1971-04-05 | [{'extension': [{'url': 'http://hl7.org/fhir/S... | False | [{'language': {'coding': [{'system': 'urn:ietf... | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | [{'system': 'http://terminology.hl7.org/CodeSy... | M | NaN |
| 1 | Patient | 588675dc-e80e-4528-a78f-af10f9755f23 | [{'url': 'http://hl7.org/fhir/StructureDefinit... | [{'system': 'https://github.com/synthetichealt... | [{'use': 'official', 'family': 'Bergstrom', 'g... | [{'system': 'phone', 'value': '555-593-7481', ... | male | 2015-10-04 | [{'extension': [{'url': 'http://hl7.org/fhir/S... | False | [{'language': {'coding': [{'system': 'urn:ietf... | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | [{'system': 'http://terminology.hl7.org/CodeSy... | Never Married | NaN |
| 2 | Patient | 118616a4-f0b2-411f-8050-39d5d27c738c | [{'url': 'http://hl7.org/fhir/StructureDefinit... | [{'system': 'https://github.com/synthetichealt... | [{'use': 'official', 'family': 'Conroy', 'give... | [{'system': 'phone', 'value': '555-324-6732', ... | male | 1995-05-04 | [{'extension': [{'url': 'http://hl7.org/fhir/S... | False | [{'language': {'coding': [{'system': 'urn:ietf... | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | [{'system': 'http://terminology.hl7.org/CodeSy... | Never Married | NaN |
| 3 | Patient | c852042b-1373-45e6-acb5-f252c733de3a | [{'url': 'http://hl7.org/fhir/StructureDefinit... | [{'system': 'https://github.com/synthetichealt... | [{'use': 'official', 'family': 'Yundt', 'given... | [{'system': 'phone', 'value': '555-620-3747', ... | male | 1991-08-15 | [{'extension': [{'url': 'http://hl7.org/fhir/S... | False | [{'language': {'coding': [{'system': 'urn:ietf... | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | [{'system': 'http://terminology.hl7.org/CodeSy... | M | NaN |
| 4 | Patient | 21fba439-ca79-411f-a081-37a432a78f3a | [{'url': 'http://hl7.org/fhir/StructureDefinit... | [{'system': 'https://github.com/synthetichealt... | [{'use': 'official', 'family': 'Bruen', 'given... | [{'system': 'phone', 'value': '555-288-1632', ... | female | 1964-12-09 | [{'extension': [{'url': 'http://hl7.org/fhir/S... | False | [{'language': {'coding': [{'system': 'urn:ietf... | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | [{'system': 'http://terminology.hl7.org/CodeSy... | M | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | Patient | daf4e787-0ea5-45ff-a9a1-c68308e9f6a3 | [{'url': 'http://hl7.org/fhir/StructureDefinit... | [{'system': 'https://github.com/synthetichealt... | [{'use': 'official', 'family': 'Feil', 'given'... | [{'system': 'phone', 'value': '555-936-3951', ... | male | 1995-06-21 | [{'extension': [{'url': 'http://hl7.org/fhir/S... | False | [{'language': {'coding': [{'system': 'urn:ietf... | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | [{'system': 'http://terminology.hl7.org/CodeSy... | Never Married | NaN |
| 96 | Patient | ffa66c0a-87b0-4b78-bb7e-3a5c7c44d359 | [{'url': 'http://hl7.org/fhir/StructureDefinit... | [{'system': 'https://github.com/synthetichealt... | [{'use': 'official', 'family': 'Keebler', 'giv... | [{'system': 'phone', 'value': '555-107-3617', ... | male | 1926-06-21 | [{'extension': [{'url': 'http://hl7.org/fhir/S... | False | [{'language': {'coding': [{'system': 'urn:ietf... | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | [{'system': 'http://terminology.hl7.org/CodeSy... | S | NaN |
| 97 | Patient | c2f03d5b-7219-4dec-bcc6-03ee42166769 | [{'url': 'http://hl7.org/fhir/StructureDefinit... | [{'system': 'https://github.com/synthetichealt... | [{'use': 'official', 'family': 'Medhurst', 'gi... | [{'system': 'phone', 'value': '555-513-5799', ... | female | 1969-11-09 | [{'extension': [{'url': 'http://hl7.org/fhir/S... | False | [{'language': {'coding': [{'system': 'urn:ietf... | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | [{'system': 'http://terminology.hl7.org/CodeSy... | S | NaN |
| 98 | Patient | 39699402-f976-4fa0-8635-f015a54f1f8b | [{'url': 'http://hl7.org/fhir/StructureDefinit... | [{'system': 'https://github.com/synthetichealt... | [{'use': 'official', 'family': 'Leuschke', 'gi... | [{'system': 'phone', 'value': '555-772-9913', ... | male | 2003-03-02 | [{'extension': [{'url': 'http://hl7.org/fhir/S... | NaN | [{'language': {'coding': [{'system': 'urn:ietf... | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | [{'system': 'http://terminology.hl7.org/CodeSy... | Never Married | 3.0 |
| 99 | Patient | 7cdaae04-4ce7-4a6d-b6a3-5cddba9bc888 | [{'url': 'http://hl7.org/fhir/StructureDefinit... | [{'system': 'https://github.com/synthetichealt... | [{'use': 'official', 'family': 'Little', 'give... | [{'system': 'phone', 'value': '555-114-3914', ... | female | 1959-07-23 | [{'extension': [{'url': 'http://hl7.org/fhir/S... | False | [{'language': {'coding': [{'system': 'urn:ietf... | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | [{'system': 'http://terminology.hl7.org/CodeSy... | M | NaN |
100 rows × 16 columns
This works, but it’s clearly not ideal in how it handles nested fields, such as the nested lists of the name field. One way we can do a little better is with the flatten_json library: https://github.com/amirziai/flatten
from flatten_json import flatten
# Flatten every resource to a single-level dict, then build one DataFrame per
# resource type. The loop variable names are kept as-is because they stay
# bound at module level after the loop finishes.
for resource_type, resources in resources_by_type.items():
    resource_dfs[resource_type] = pd.json_normalize(
        [flatten(resource) for resource in resources]
    )
# Now let's take another look
resource_dfs['Patient']| resourceType | id | text_status | text_div | extension_0_url | extension_0_valueString | extension_1_url | extension_1_valueAddress_city | extension_1_valueAddress_state | extension_1_valueAddress_country | ... | maritalStatus_coding_0_system | maritalStatus_coding_0_code | maritalStatus_coding_0_display | maritalStatus_text | multipleBirthBoolean | communication_0_language_coding_0_system | communication_0_language_coding_0_code | communication_0_language_coding_0_display | communication_0_language_text | multipleBirthInteger | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Patient | 58c297c4-d684-4677-8024-01131d93835e | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | http://hl7.org/fhir/StructureDefinition/patien... | Marquetta Schamberger | http://hl7.org/fhir/StructureDefinition/patien... | Macau | Macao Special Administrative Region of the Peo... | CN | ... | http://terminology.hl7.org/CodeSystem/v3-Marit... | M | M | M | False | urn:ietf:bcp:47 | zh | Chinese | Chinese | NaN |
| 1 | Patient | 588675dc-e80e-4528-a78f-af10f9755f23 | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | http://hl7.org/fhir/StructureDefinition/patien... | Ashely Considine | http://hl7.org/fhir/StructureDefinition/patien... | Boston | Massachusetts | US | ... | http://terminology.hl7.org/CodeSystem/v3-Marit... | S | Never Married | Never Married | False | urn:ietf:bcp:47 | en-US | English | English | NaN |
| 2 | Patient | 118616a4-f0b2-411f-8050-39d5d27c738c | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | http://hl7.org/fhir/StructureDefinition/patien... | Nelda Monahan | http://hl7.org/fhir/StructureDefinition/patien... | Quincy | Massachusetts | US | ... | http://terminology.hl7.org/CodeSystem/v3-Marit... | S | Never Married | Never Married | False | urn:ietf:bcp:47 | en-US | English | English | NaN |
| 3 | Patient | c852042b-1373-45e6-acb5-f252c733de3a | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | http://hl7.org/fhir/StructureDefinition/patien... | Nell Berge | http://hl7.org/fhir/StructureDefinition/patien... | Agawam | Massachusetts | US | ... | http://terminology.hl7.org/CodeSystem/v3-Marit... | M | M | M | False | urn:ietf:bcp:47 | en-US | English | English | NaN |
| 4 | Patient | 21fba439-ca79-411f-a081-37a432a78f3a | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | http://hl7.org/fhir/StructureDefinition/patien... | Jerilyn Zieme | http://hl7.org/fhir/StructureDefinition/patien... | Amherst | Massachusetts | US | ... | http://terminology.hl7.org/CodeSystem/v3-Marit... | M | M | M | False | urn:ietf:bcp:47 | en-US | English | English | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | Patient | daf4e787-0ea5-45ff-a9a1-c68308e9f6a3 | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | http://hl7.org/fhir/StructureDefinition/patien... | Doris Lubowitz | http://hl7.org/fhir/StructureDefinition/patien... | Boston | Massachusetts | US | ... | http://terminology.hl7.org/CodeSystem/v3-Marit... | S | Never Married | Never Married | False | urn:ietf:bcp:47 | en-US | English | English | NaN |
| 96 | Patient | ffa66c0a-87b0-4b78-bb7e-3a5c7c44d359 | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | http://hl7.org/fhir/StructureDefinition/patien... | Mallie Kautzer | http://hl7.org/fhir/StructureDefinition/patien... | Millbury | Massachusetts | US | ... | http://terminology.hl7.org/CodeSystem/v3-Marit... | S | S | S | False | urn:ietf:bcp:47 | en-US | English | English | NaN |
| 97 | Patient | c2f03d5b-7219-4dec-bcc6-03ee42166769 | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | http://hl7.org/fhir/StructureDefinition/patien... | Jamie Bogisich | http://hl7.org/fhir/StructureDefinition/patien... | Charlton | Massachusetts | US | ... | http://terminology.hl7.org/CodeSystem/v3-Marit... | S | S | S | False | urn:ietf:bcp:47 | en-US | English | English | NaN |
| 98 | Patient | 39699402-f976-4fa0-8635-f015a54f1f8b | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | http://hl7.org/fhir/StructureDefinition/patien... | Lauri Kuhn | http://hl7.org/fhir/StructureDefinition/patien... | Lowell | Massachusetts | US | ... | http://terminology.hl7.org/CodeSystem/v3-Marit... | S | Never Married | Never Married | NaN | urn:ietf:bcp:47 | en-US | English | English | 3.0 |
| 99 | Patient | 7cdaae04-4ce7-4a6d-b6a3-5cddba9bc888 | generated | <div xmlns="http://www.w3.org/1999/xhtml">Gene... | http://hl7.org/fhir/StructureDefinition/patien... | Ming Herman | http://hl7.org/fhir/StructureDefinition/patien... | Lawrence | Massachusetts | US | ... | http://terminology.hl7.org/CodeSystem/v3-Marit... | M | M | M | False | urn:ietf:bcp:47 | en-US | English | English | NaN |
100 rows × 73 columns
Let’s look at just one row so it’s easier to see all the columns and an example value:
with pd.option_context('display.max_rows', 1000, 'display.max_columns', 10):
print(resource_dfs['Patient'].loc[0].T)resourceType Patient id 58c297c4-d684-4677-8024-01131d93835e text_status generated text_div <div xmlns="http://www.w3.org/1999/xhtml">Gene... extension_0_url http://hl7.org/fhir/StructureDefinition/patien... extension_0_valueString Marquetta Schamberger extension_1_url http://hl7.org/fhir/StructureDefinition/patien... extension_1_valueAddress_city Macau extension_1_valueAddress_state Macao Special Administrative Region of the Peo... extension_1_valueAddress_country CN extension_2_url http://synthetichealth.github.io/synthea/disab... extension_2_valueDecimal 9.931319 extension_3_url http://synthetichealth.github.io/synthea/quali... extension_3_valueDecimal 38.068681 identifier_0_system https://github.com/synthetichealth/synthea identifier_0_value 58c297c4-d684-4677-8024-01131d93835e identifier_1_type_coding_0_system http://terminology.hl7.org/CodeSystem/v2-0203 identifier_1_type_coding_0_code MR identifier_1_type_coding_0_display Medical Record Number identifier_1_type_text Medical Record Number identifier_1_system http://hospital.smarthealthit.org identifier_1_value 58c297c4-d684-4677-8024-01131d93835e identifier_2_type_coding_0_system http://terminology.hl7.org/CodeSystem/v2-0203 identifier_2_type_coding_0_code SS identifier_2_type_coding_0_display Social Security Number identifier_2_type_text Social Security Number identifier_2_system http://hl7.org/fhir/sid/us-ssn identifier_2_value 999-33-6284 identifier_3_type_coding_0_system http://terminology.hl7.org/CodeSystem/v2-0203 identifier_3_type_coding_0_code DL identifier_3_type_coding_0_display Driver's License identifier_3_type_text Driver's License identifier_3_system urn:oid:2.16.840.1.113883.4.3.25 identifier_3_value S99959553 identifier_4_type_coding_0_system http://terminology.hl7.org/CodeSystem/v2-0203 identifier_4_type_coding_0_code PPN identifier_4_type_coding_0_display Passport Number identifier_4_type_text Passport Number identifier_4_system 
http://standardhealthrecord.org/fhir/Structure... identifier_4_value X69743686X name_0_use official name_0_family Wintheiser name_0_given_0 Aleta name_0_prefix_0 Mrs. name_1_use maiden name_1_family Heathcote name_1_given_0 Aleta name_1_prefix_0 Mrs. telecom_0_system phone telecom_0_value 555-712-4709 telecom_0_use home gender female birthDate 1971-04-05 address_0_extension_0_url http://hl7.org/fhir/StructureDefinition/geoloc... address_0_extension_0_extension_0_url latitude address_0_extension_0_extension_0_valueDecimal 42.469724 address_0_extension_0_extension_1_url longitude address_0_extension_0_extension_1_valueDecimal -71.09393 address_0_line_0 850 Kertzmann Heights address_0_city Everett address_0_state Massachusetts address_0_postalCode 02149 address_0_country US maritalStatus_coding_0_system http://terminology.hl7.org/CodeSystem/v3-Marit... maritalStatus_coding_0_code M maritalStatus_coding_0_display M maritalStatus_text M multipleBirthBoolean False communication_0_language_coding_0_system urn:ietf:bcp:47 communication_0_language_coding_0_code zh communication_0_language_coding_0_display Chinese communication_0_language_text Chinese multipleBirthInteger NaN Name: 0, dtype: object
Next, what if we know in advance we will only want certain fields?
For this, we will use SAS’s Python implementation of SQL on FHIR. SQL on FHIR uses FHIRPath to identify which elements we want to extract and map these onto the desired column name in the DataFrame.
The FHIRPath specification describes FHIRPath as "a path based navigation and extraction language, somewhat like XPath. Operations are expressed in terms of the logical content of hierarchical data models, and support traversal, selection and filtering of data."
If you are not familiar with FHIRPath, Section 3 of the FHIRPath spec describes some of the basics.
If we want the record identifier, sex, date of birth, and marital status, we would need to construct the following FHIRPaths:
| Target FHIR Element | FHIRPath | Destination Column |
|---|---|---|
| Record ID | `getResourceKey()` | `id` |
| Sex | `gender`* | `sex` |
| Date of birth | `birthDate` | `birth_date` |
| Marital status | `maritalStatus.coding.first().code` | `marital_status` |
* Technically we should use the US Core Individual Sex extension for extracting the sex, but since our example data does not include that extension, we use gender as a fallback.
We then need to create a SQL on FHIR ViewDefinition to operationalize this mapping.
import sqlonfhir
# SQL on FHIR ViewDefinition for Patient resources: each (column name,
# FHIRPath) pair below becomes one entry in the "column" list.
patient_view = {
    "resource": "Patient",
    "select": [
        {
            "column": [
                {"name": column_name, "path": fhirpath}
                for column_name, fhirpath in (
                    ("id", "getResourceKey()"),
                    ("sex", "gender"),
                    ("birth_date", "birthDate"),
                    ("marital_status", "maritalStatus.coding.first().code"),
                )
            ]
        },
    ],
}
pd.DataFrame(sqlonfhir.evaluate(resources, patient_view))
| | id | sex | birth_date | marital_status |
|---|---|---|---|---|
| 0 | 58c297c4-d684-4677-8024-01131d93835e | female | 1971-04-05 | M |
| 1 | 588675dc-e80e-4528-a78f-af10f9755f23 | male | 2015-10-04 | S |
| 2 | 118616a4-f0b2-411f-8050-39d5d27c738c | male | 1995-05-04 | S |
| 3 | c852042b-1373-45e6-acb5-f252c733de3a | male | 1991-08-15 | M |
| 4 | 21fba439-ca79-411f-a081-37a432a78f3a | female | 1964-12-09 | M |
| ... | ... | ... | ... | ... |
| 95 | daf4e787-0ea5-45ff-a9a1-c68308e9f6a3 | male | 1995-06-21 | S |
| 96 | ffa66c0a-87b0-4b78-bb7e-3a5c7c44d359 | male | 1926-06-21 | S |
| 97 | c2f03d5b-7219-4dec-bcc6-03ee42166769 | female | 1969-11-09 | S |
| 98 | 39699402-f976-4fa0-8635-f015a54f1f8b | male | 2003-03-02 | S |
| 99 | 7cdaae04-4ce7-4a6d-b6a3-5cddba9bc888 | female | 1959-07-23 | M |
100 rows × 4 columns
5 Bringing it all together
Now we have everything we need to connect to a FHIR server that supports Bulk Data, request and download exported data, and convert it into a DataFrame. Let’s bring everything together from the previous steps into one class with a clear entry point.
import datetime
import json
from collections import defaultdict
from time import sleep
from typing import Any, Optional
import jwt
import pandas as pd
import requests
import sqlonfhir
class ViewDefinitions(dict[str, Any]):
    """
    Dict of ViewDefinitions keyed by FHIR resource type, used by the fetcher classes.

    Python-friendly names such as ``patient`` or ``medication_request`` are
    converted to FHIR resource names (``Patient``, ``MedicationRequest``)
    wherever a key is supplied: attribute access, ``[]``, ``in``, ``get``,
    ``pop``, ``setdefault``, ``update``, ``del`` and the constructor.

    This class allows you to have something like:
        bulk_data_fetcher.view_definitions.patient = <Patient ViewDefinition goes here>
    After you run this, you will have:
        bulk_data_fetcher.view_definitions = {"Patient": <Patient ViewDefinition goes here>}
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # Route initial contents through update() so keys passed to the
        # constructor are normalized exactly like later assignments.
        super().__init__()
        self.update(*args, **kwargs)

    @staticmethod
    def _resource_type_from_name(name: str) -> str:
        """Return the FHIR resource name for a snake_case or lowercase name."""
        # Names that already look like FHIR resource names (no underscore,
        # not all-lowercase) are passed through untouched.
        if "_" in name or name.islower():
            return "".join(part[:1].upper() + part[1:] for part in name.split("_"))
        return name

    def __getattr__(self, name: str) -> Any:
        # Only called when normal attribute lookup fails, so dict methods
        # and real instance attributes are never shadowed.
        key = self._resource_type_from_name(name)
        try:
            return super().__getitem__(key)
        except KeyError:
            raise AttributeError(f"No view definition for {key}") from None

    def __setattr__(self, name: str, value: Any) -> None:
        # Underscore-prefixed names are stored as ordinary attributes;
        # everything else is stored as a ViewDefinition entry.
        if name.startswith("_"):
            object.__setattr__(self, name, value)
            return
        self[name] = value

    def __delattr__(self, name: str) -> None:
        # Mirror __setattr__: private names are real attributes, public
        # names are entries in the mapping.
        if name.startswith("_"):
            object.__delattr__(self, name)
            return
        key = self._resource_type_from_name(name)
        try:
            super().__delitem__(key)
        except KeyError:
            raise AttributeError(f"No view definition for {key}") from None

    def __contains__(self, key: object) -> bool:
        # Mapping.__contains__ accepts any object, so only normalize
        # actual strings.
        if not isinstance(key, str):
            return False
        return super().__contains__(self._resource_type_from_name(key))

    def __getitem__(self, resource_type: str) -> Any:
        return super().__getitem__(self._resource_type_from_name(resource_type))

    def __setitem__(self, resource_type: str, view_definition: Any) -> None:
        resource_type = self._resource_type_from_name(resource_type)
        super().__setitem__(resource_type, view_definition)

    def __delitem__(self, resource_type: str) -> None:
        super().__delitem__(self._resource_type_from_name(resource_type))

    # dict's own get/pop/setdefault/update do not go through __getitem__ or
    # __setitem__, so they are overridden to apply the same normalization.

    def get(self, key: Any, default: Any = None) -> Any:
        if isinstance(key, str):
            key = self._resource_type_from_name(key)
        return super().get(key, default)

    def pop(self, key: Any, *default: Any) -> Any:
        if isinstance(key, str):
            key = self._resource_type_from_name(key)
        return super().pop(key, *default)

    def setdefault(self, key: str, default: Any = None) -> Any:
        return super().setdefault(self._resource_type_from_name(key), default)

    def update(self, *args: Any, **kwargs: Any) -> None:
        # dict(...) accepts the same argument forms as dict.update().
        for key, value in dict(*args, **kwargs).items():
            self[key] = value
class BulkDataFetcher:
    """
    Request, download, and tabulate a FHIR Bulk Data export.

    Authenticates with SMART Backend Authorization (a signed JWT exchanged
    for a Bearer token), starts an asynchronous ``$export``, polls until it
    completes, downloads the NDJSON output, and converts it to DataFrames
    using the ViewDefinitions registered on ``view_definitions``.
    """

    # Seconds to wait between status polls when the server's Retry-After
    # header is missing or is not a plain integer.
    _DEFAULT_POLL_SECONDS = 5

    def __init__(
        self,
        base_url: str,
        client_id: str,
        private_key: str,
        key_id: str,
        endpoint: Optional[str] = None,
        session: Optional[requests.Session] = None,
    ):
        """
        Store the credentials and look up the server's token endpoint.

        Args:
            base_url: Base URL of the FHIR server.
            client_id: Client ID; sent as the ``iss`` and ``sub`` JWT claims.
            private_key: Key used to sign the JWT assertion with RS384.
            key_id: Sent as the ``kid`` header of the JWT assertion.
            endpoint: Path segment placed before ``$export``. Defaults to
                "Patient".
            session: Session used for every request. A new
                ``requests.Session`` is created when omitted.

        Raises:
            RuntimeError: If the SMART configuration request returns a
                4xx/5xx status.
        """
        self.base_url = base_url
        self.client_id = client_id
        self.private_key = private_key
        self.key_id = key_id
        self.token = None
        self.expire_time = None
        self.endpoint = "Patient" if endpoint is None else endpoint
        self.session = requests.Session() if session is None else session

        r = self.session.get(f"{base_url}/.well-known/smart-configuration")
        self._raise_for_error(r)
        smart_config = r.json()
        self.token_endpoint = smart_config["token_endpoint"]

        self.view_definitions = ViewDefinitions()
        self.downloaded_resources = defaultdict(list)

    @staticmethod
    def _raise_for_error(response) -> None:
        """Raise RuntimeError carrying the response body on a 4xx/5xx status."""
        if response.status_code >= 400:
            raise RuntimeError(response.text)

    @classmethod
    def _retry_delay(cls, retry_after) -> int:
        """Return the number of seconds to wait for a Retry-After header value."""
        try:
            return max(int(retry_after), 0)
        except (TypeError, ValueError):
            # Header absent (None) or not an integer number of seconds.
            return cls._DEFAULT_POLL_SECONDS

    def get_token(self):
        """
        Return a Bearer token, requesting a new one when none is cached or
        the cached one has expired.

        Raises:
            RuntimeError: If the token endpoint returns a 4xx/5xx status.
        """
        # Local import: uuid is only needed here, for the jti claim.
        import uuid

        if self.token and datetime.datetime.now() < self.expire_time:
            # the existing token is still valid so use it
            return self.token

        assertion = jwt.encode(
            {
                "iss": self.client_id,
                "sub": self.client_id,
                "aud": self.token_endpoint,
                "exp": int(
                    (
                        datetime.datetime.now() + datetime.timedelta(minutes=5)
                    ).timestamp()
                ),
                # SMART Backend Services requires a unique identifier on
                # every authentication JWT.
                "jti": str(uuid.uuid4()),
            },
            self.private_key,
            algorithm="RS384",
            headers={"kid": self.key_id},
        )
        r = self.session.post(
            self.token_endpoint,
            data={
                "scope": "system/*.read",
                "grant_type": "client_credentials",
                "client_assertion_type": "urn:ietf:params:oauth:client-assertion-type:jwt-bearer",
                "client_assertion": assertion,
            },
        )
        self._raise_for_error(r)
        token_response = r.json()
        self.token = token_response["access_token"]
        self.expire_time = datetime.datetime.now() + datetime.timedelta(
            seconds=token_response["expires_in"]
        )
        return self.token

    def _start_bulk_data_export(self):
        """
        Kick off an export of every resource type that has a ViewDefinition.

        Returns:
            The status URL from the Content-Location header, also stored on
            ``self.check_url``.

        Raises:
            RuntimeError: If the kick-off fails or returns no status URL.
        """
        types = ",".join(self.view_definitions.keys())
        url = f"{self.base_url}/{self.endpoint}/$export?_type={types}"
        print(f"Fetching from {url}")
        r = self.session.get(
            url,
            headers={
                "Authorization": f"Bearer {self.get_token()}",
                "Accept": "application/fhir+json",
                "Prefer": "respond-async",
            },
        )
        self._raise_for_error(r)
        try:
            self.check_url = r.headers["Content-Location"]
        except KeyError:
            raise RuntimeError(
                f"Export kick-off returned no Content-Location header: {r.text}"
            ) from None
        return self.check_url

    def _wait_for_bulk_data_export_to_be_ready_for_download(self):
        """
        Poll the status URL until the export completes.

        Returns:
            The ``output`` list from the completed status response, also
            stored on ``self.ndjson_to_download``.

        Raises:
            RuntimeError: If the status endpoint returns anything other than
                200 or 202.
        """
        while True:
            r = self.session.get(
                self.check_url,
                headers={
                    "Authorization": f"Bearer {self.get_token()}",
                    "Accept": "application/json",
                },
            )
            # There are three possible options here: http://hl7.org/fhir/uv/bulkdata/export.html#bulk-data-status-request
            #   Error = 4xx or 5xx status code
            #   In-Progress = 202
            #   Complete = 200
            if r.status_code == 200:
                # complete
                response = r.json()
                self.ndjson_to_download = response["output"]
                return self.ndjson_to_download
            if r.status_code == 202:
                # in progress
                sleep(self._retry_delay(r.headers.get("Retry-After")))
                continue
            raise RuntimeError(r.text)

    def download_data(self):
        """
        Run an export and append the resources to ``downloaded_resources``.

        Resources are appended to whatever is already stored, so clear
        ``downloaded_resources`` first when re-downloading.

        Raises:
            RuntimeError: If any request in the export flow fails.
        """
        self._start_bulk_data_export()
        self._wait_for_bulk_data_export_to_be_ready_for_download()
        for ndjson_file_info in self.ndjson_to_download:
            download_url = ndjson_file_info["url"]
            resource_type = ndjson_file_info["type"]

            # Download NDJSON
            r = self.session.get(
                download_url,
                headers={
                    "Authorization": f"Bearer {self.get_token()}",
                    "Accept": "application/fhir+ndjson",
                },
            )
            self._raise_for_error(r)

            # Convert NDJSON into native Python objects and store for
            # later processing. Blank lines (including an entirely empty
            # file) are skipped rather than passed to json.loads.
            for line in r.text.split("\n"):
                if line.strip():
                    self.downloaded_resources[resource_type].append(json.loads(line))

    def fhir_to_dataframes(self):
        """
        Return ``{resource type: DataFrame}`` for every registered ViewDefinition.

        Downloads the data first when nothing has been downloaded yet.
        """
        # Check to make sure data are downloaded; download if they aren't
        if not self.downloaded_resources:
            self.download_data()

        dfs = {}
        for resource_type, view_definition in self.view_definitions.items():
            dfs[resource_type] = pd.DataFrame(sqlonfhir.evaluate(
                self.downloaded_resources[resource_type], view_definition
            ))
        return dfs
# And then to invoke it:
# Build a fetcher from the credentials and session set up previously. No
# endpoint is passed, so the class default is used.
fetcher = BulkDataFetcher(
    base_url=server_url,
    client_id=client_id,
    private_key=private_key,
    key_id=key_id,
    session=session,
)
# Define a ViewDefinition
fetcher.view_definitions.patient = {
"resource": "Patient",
"select": [
{
"column": [
{"name": "id", "path": "getResourceKey()"},
{"name": "sex", "path": "gender"},
{"name": "birth_date", "path": "birthDate"},
{"name": "marital_status", "path": "maritalStatus.coding.first().code"},
]
},
],
}
Download the data:
fetcher.download_data()
Fetching from https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Patient/$export?_type=Patient
Convert the FHIR data into a DataFrame:
dfs = fetcher.fhir_to_dataframes()
dfs['Patient']
| | id | sex | birth_date | marital_status |
|---|---|---|---|---|
| 0 | 58c297c4-d684-4677-8024-01131d93835e | female | 1971-04-05 | M |
| 1 | 588675dc-e80e-4528-a78f-af10f9755f23 | male | 2015-10-04 | S |
| 2 | 118616a4-f0b2-411f-8050-39d5d27c738c | male | 1995-05-04 | S |
| 3 | c852042b-1373-45e6-acb5-f252c733de3a | male | 1991-08-15 | M |
| 4 | 21fba439-ca79-411f-a081-37a432a78f3a | female | 1964-12-09 | M |
| ... | ... | ... | ... | ... |
| 95 | daf4e787-0ea5-45ff-a9a1-c68308e9f6a3 | male | 1995-06-21 | S |
| 96 | ffa66c0a-87b0-4b78-bb7e-3a5c7c44d359 | male | 1926-06-21 | S |
| 97 | c2f03d5b-7219-4dec-bcc6-03ee42166769 | female | 1969-11-09 | S |
| 98 | 39699402-f976-4fa0-8635-f015a54f1f8b | male | 2003-03-02 | S |
| 99 | 7cdaae04-4ce7-4a6d-b6a3-5cddba9bc888 | female | 1959-07-23 | M |
100 rows × 4 columns
Add another resource type:
# ViewDefinition for Condition resources: each (column name, FHIRPath) pair
# below becomes one entry in the "column" list.
fetcher.view_definitions.condition = {
    "resource": "Condition",
    "select": [
        {
            "column": [
                {"name": column_name, "path": fhirpath}
                for column_name, fhirpath in (
                    ("condition_id", "getResourceKey()"),
                    ("patient_id", "subject.getReferenceKey(Patient)"),
                    ("code", "code.coding.first().code"),
                    ("code_display", "code.coding.first().display"),
                )
            ]
        },
    ],
}

# An export only includes the resource types that were requested, so a newly
# added resource type means downloading again. That download also fetches
# the Patient data a second time; empty downloaded_resources first so the
# Patient resources are not stored twice.
fetcher.downloaded_resources.clear()
fetcher.download_data()
dfs = fetcher.fhir_to_dataframes()
# Display the newly downloaded Conditions data
dfs['Condition']
Fetching from https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Patient/$export?_type=Patient,Condition
| | condition_id | patient_id | code | code_display |
|---|---|---|---|---|
| 0 | 33ed785d-01d0-4a52-946e-60f04a957106 | 58c297c4-d684-4677-8024-01131d93835e | 40055000 | Chronic sinusitis (disorder) |
| 1 | 0b32d7c5-581e-45f3-bf8b-22fbbefd7618 | 58c297c4-d684-4677-8024-01131d93835e | 128613002 | Seizure disorder |
| 2 | ee3529f2-d48b-4c1d-9901-6672772c0cee | 58c297c4-d684-4677-8024-01131d93835e | 703151001 | History of single seizure (situation) |
| 3 | d3dbfce5-c2c5-43b3-9ad3-9cb4b6e62d98 | 58c297c4-d684-4677-8024-01131d93835e | 15777000 | Prediabetes |
| 4 | 02f208e8-ed4a-4395-a6c3-892b576d964a | 58c297c4-d684-4677-8024-01131d93835e | 271737000 | Anemia (disorder) |
| ... | ... | ... | ... | ... |
| 634 | 90cf23ae-d574-4ead-b5ea-e468128bbf22 | 7cdaae04-4ce7-4a6d-b6a3-5cddba9bc888 | 55822004 | Hyperlipidemia |
| 635 | 5395b7d3-2765-4079-98e3-033d9689df6a | 7cdaae04-4ce7-4a6d-b6a3-5cddba9bc888 | 10509002 | Acute bronchitis (disorder) |
| 636 | 2e288251-42fd-442c-bae1-b0d5f512d3f1 | 7cdaae04-4ce7-4a6d-b6a3-5cddba9bc888 | 195662009 | Acute viral pharyngitis (disorder) |
| 637 | 0a1f4fdc-7ae5-4b53-b586-fb78dff46f16 | 7cdaae04-4ce7-4a6d-b6a3-5cddba9bc888 | 65363002 | Otitis media |
| 638 | d949d776-2062-41ad-8149-c8effc6abb6d | 7cdaae04-4ce7-4a6d-b6a3-5cddba9bc888 | 64859006 | Osteoporosis (disorder) |
639 rows × 4 columns
6 Group export
§170.315(g)(10) Standardized API for patient and population services requires group-export as of December 2022.
This is therefore the FHIR Bulk Data endpoint you are likely to find in EHRs.
To use this endpoint, you will need the ID of the group of patients you want to export. In a production setting, this would typically be provided by the administrators of the EHR.
For the bulk-data.smarthealthit.org testing server, we can ask it for a list of groups via the FHIR API:
# Search for Group resources on the server. session, server_url and
# get_token are not defined in this section -- presumably set up earlier.
r = session.get(
    f'{server_url}/Group',
    headers={
        'Authorization': f'Bearer {get_token()}',
        'Accept': 'application/fhir+json',
    },
)
r.json()
{'resourceType': 'Bundle',
'id': 'f41b5caca4db39ae4d0a0f84549d6bd063174d72caae9cd4b2a719f564a57e59',
'meta': {'lastUpdated': '2026-05-31 21:33:32'},
'type': 'searchset',
'total': 8,
'link': [{'relation': 'self',
'url': 'https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Group'}],
'entry': [{'fullUrl': 'https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Group/BMCHealthNet',
'resource': {'resourceType': 'Group',
'id': 'BMCHealthNet',
'identifier': [{'system': 'https://bulk-data/db-id',
'value': 'BMCHealthNet'}],
'quantity': 10,
'name': 'BMC HealthNet',
'text': {'status': 'generated',
'div': '<div xmlns="http://www.w3.org/1999/xhtml">BMC HealthNet</div>'},
'type': 'person',
'actual': True}},
{'fullUrl': 'https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Group/BlueCrossBlueShield',
'resource': {'resourceType': 'Group',
'id': 'BlueCrossBlueShield',
'identifier': [{'system': 'https://bulk-data/db-id',
'value': 'BlueCrossBlueShield'}],
'quantity': 27,
'name': 'Blue Cross Blue Shield',
'text': {'status': 'generated',
'div': '<div xmlns="http://www.w3.org/1999/xhtml">Blue Cross Blue Shield</div>'},
'type': 'person',
'actual': True}},
{'fullUrl': 'https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Group/FallonHealth',
'resource': {'resourceType': 'Group',
'id': 'FallonHealth',
'identifier': [{'system': 'https://bulk-data/db-id',
'value': 'FallonHealth'}],
'quantity': 3,
'name': 'Fallon Health',
'text': {'status': 'generated',
'div': '<div xmlns="http://www.w3.org/1999/xhtml">Fallon Health</div>'},
'type': 'person',
'actual': True}},
{'fullUrl': 'https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Group/HarvardPilgrimHealthCare',
'resource': {'resourceType': 'Group',
'id': 'HarvardPilgrimHealthCare',
'identifier': [{'system': 'https://bulk-data/db-id',
'value': 'HarvardPilgrimHealthCare'}],
'quantity': 3,
'name': 'Harvard Pilgrim Health Care',
'text': {'status': 'generated',
'div': '<div xmlns="http://www.w3.org/1999/xhtml">Harvard Pilgrim Health Care</div>'},
'type': 'person',
'actual': True}},
{'fullUrl': 'https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Group/HealthNewEngland',
'resource': {'resourceType': 'Group',
'id': 'HealthNewEngland',
'identifier': [{'system': 'https://bulk-data/db-id',
'value': 'HealthNewEngland'}],
'quantity': 25,
'name': 'Health New England',
'text': {'status': 'generated',
'div': '<div xmlns="http://www.w3.org/1999/xhtml">Health New England</div>'},
'type': 'person',
'actual': True}},
{'fullUrl': 'https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Group/MinutemanHealth',
'resource': {'resourceType': 'Group',
'id': 'MinutemanHealth',
'identifier': [{'system': 'https://bulk-data/db-id',
'value': 'MinutemanHealth'}],
'quantity': 3,
'name': 'Minuteman Health',
'text': {'status': 'generated',
'div': '<div xmlns="http://www.w3.org/1999/xhtml">Minuteman Health</div>'},
'type': 'person',
'actual': True}},
{'fullUrl': 'https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Group/NeighborhoodHealthPlan',
'resource': {'resourceType': 'Group',
'id': 'NeighborhoodHealthPlan',
'identifier': [{'system': 'https://bulk-data/db-id',
'value': 'NeighborhoodHealthPlan'}],
'quantity': 7,
'name': 'Neighborhood Health Plan',
'text': {'status': 'generated',
'div': '<div xmlns="http://www.w3.org/1999/xhtml">Neighborhood Health Plan</div>'},
'type': 'person',
'actual': True}},
{'fullUrl': 'https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Group/TuftsHealthPlan',
'resource': {'resourceType': 'Group',
'id': 'TuftsHealthPlan',
'identifier': [{'system': 'https://bulk-data/db-id',
'value': 'TuftsHealthPlan'}],
'quantity': 22,
'name': 'Tufts Health Plan',
'text': {'status': 'generated',
'div': '<div xmlns="http://www.w3.org/1999/xhtml">Tufts Health Plan</div>'},
'type': 'person',
'actual': True}}]}
Let’s quickly pull this into a Pandas DataFrame to make it easier to read:
# One row per Bundle entry, keeping only each Group's id, name and quantity.
groups = pd.json_normalize(r.json()['entry']).loc[
    :, ['resource.id', 'resource.name', 'resource.quantity']
]
groups
| | resource.id | resource.name | resource.quantity |
|---|---|---|---|
| 0 | BMCHealthNet | BMC HealthNet | 10 |
| 1 | BlueCrossBlueShield | Blue Cross Blue Shield | 27 |
| 2 | FallonHealth | Fallon Health | 3 |
| 3 | HarvardPilgrimHealthCare | Harvard Pilgrim Health Care | 3 |
| 4 | HealthNewEngland | Health New England | 25 |
| 5 | MinutemanHealth | Minuteman Health | 3 |
| 6 | NeighborhoodHealthPlan | Neighborhood Health Plan | 7 |
| 7 | TuftsHealthPlan | Tufts Health Plan | 22 |
Now we can request the patients and associated data for a specific group:
# Take the ID of the first group in the table.
group_id = groups.at[0, 'resource.id']

fetcher = BulkDataFetcher(
    base_url=server_url,
    client_id=client_id,
    private_key=private_key,
    key_id=key_id,
    session=session,
    # Export from this group's endpoint instead of the default, which
    # covers all patients.
    endpoint=f'Group/{group_id}',
)

# Same Patient ViewDefinition as before: one "column" entry per
# (column name, FHIRPath) pair.
fetcher.view_definitions.patient = {
    "resource": "Patient",
    "select": [
        {
            "column": [
                {"name": column_name, "path": fhirpath}
                for column_name, fhirpath in (
                    ("id", "getResourceKey()"),
                    ("sex", "gender"),
                    ("birth_date", "birthDate"),
                    ("marital_status", "maritalStatus.coding.first().code"),
                )
            ]
        },
    ],
}

fetcher.download_data()
dfs = fetcher.fhir_to_dataframes()
dfs['Patient']
Fetching from https://bulk-data.smarthealthit.org/eyJlcnIiOiIiLCJwYWdlIjoxMDAwMCwiZHVyIjoxMCwidGx0IjoxNSwibSI6MSwic3R1Ijo0LCJkZWwiOjB9/fhir/Group/BMCHealthNet/$export?_type=Patient
| | id | sex | birth_date | marital_status |
|---|---|---|---|---|
| 0 | 588675dc-e80e-4528-a78f-af10f9755f23 | male | 2015-10-04 | S |
| 1 | b4405d0c-89e6-4788-b34a-f013480249c3 | male | 1969-03-30 | M |
| 2 | 0e4c9b04-9248-4eeb-b4d0-30467390cc74 | male | 1938-12-03 | M |
| 3 | 8d3e1155-278a-4824-a7e0-fddb24c7c179 | male | 1991-10-27 | S |
| 4 | ee6df9f5-bb0b-44d5-9561-c7cb020d5ae7 | male | 2002-05-29 | S |
| 5 | b4cd3c8e-766c-4cb0-a1bf-f3b03d469785 | female | 1941-01-31 | S |
| 6 | 4b696341-294b-450b-98ed-390d3c957bd3 | female | 1941-12-14 | M |
| 7 | 62247e85-c8c1-4047-90b3-e0b3a9c59600 | female | 2018-12-14 | S |
| 8 | d33efe23-cef9-40e5-a7c0-6d68c1349e51 | male | 1994-05-31 | S |
| 9 | 39699402-f976-4fa0-8635-f015a54f1f8b | male | 2003-03-02 | S |
A number of different FHIR resources are available from the test server:
- AllergyIntolerance
- CarePlan
- CareTeam
- Claim
- Condition
- Device
- DiagnosticReport
- DocumentReference
- Encounter
- ExplanationOfBenefit
- ImagingStudy
- Immunization
- MedicationRequest
- Observation
- Patient
- Procedure
You can write your own ViewDefinitions using the pattern above to test out retrieving data for these different resource types.
7 Creating FHIRPaths
It may be helpful to use an online tool like https://hl7.github.io/fhirpath.js/ to assist with creating FHIRPaths for filtering the FHIR resources down for creating DataFrames. (Note that you should not use online tools like this with identified patient data.)
Given we are using synthetic data, it’s fine to copy/paste this into an online tool. Let’s grab one Patient resource instance from the last data pull we did for exploration with the FHIRPath tool.
print(json.dumps(fetcher.downloaded_resources['Patient'][0], indent=4)){ "resourceType": "Patient", "id": "588675dc-e80e-4528-a78f-af10f9755f23", "text": { "status": "generated", "div": "<div xmlns=\"http://www.w3.org/1999/xhtml\">Generated by <a href=\"https://github.com/synthetichealth/synthea\">Synthea</a>.Version identifier: v2.5.0-385-ge50db853\n . Person seed: 2528231247876604580 Population seed: 1588766244164</div>" }, "extension": [ { "url": "http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName", "valueString": "Ashely Considine" }, { "url": "http://hl7.org/fhir/StructureDefinition/patient-birthPlace", "valueAddress": { "city": "Boston", "state": "Massachusetts", "country": "US" } }, { "url": "http://synthetichealth.github.io/synthea/disability-adjusted-life-years", "valueDecimal": 0 }, { "url": "http://synthetichealth.github.io/synthea/quality-adjusted-life-years", "valueDecimal": 4 } ], "identifier": [ { "system": "https://github.com/synthetichealth/synthea", "value": "588675dc-e80e-4528-a78f-af10f9755f23" }, { "type": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/v2-0203", "code": "MR", "display": "Medical Record Number" } ], "text": "Medical Record Number" }, "system": "http://hospital.smarthealthit.org", "value": "588675dc-e80e-4528-a78f-af10f9755f23" }, { "type": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/v2-0203", "code": "SS", "display": "Social Security Number" } ], "text": "Social Security Number" }, "system": "http://hl7.org/fhir/sid/us-ssn", "value": "999-11-9545" } ], "name": [ { "use": "official", "family": "Bergstrom", "given": [ "Ben" ] } ], "telecom": [ { "system": "phone", "value": "555-593-7481", "use": "home" } ], "gender": "male", "birthDate": "2015-10-04", "address": [ { "extension": [ { "url": "http://hl7.org/fhir/StructureDefinition/geolocation", "extension": [ { "url": "latitude", "valueDecimal": 42.39313908431516 }, { "url": "longitude", "valueDecimal": -71.12231944002103 } ] } 
], "line": [ "1079 Hermiston Road Suite 31" ], "city": "Brookline", "state": "Massachusetts", "country": "US" } ], "maritalStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus", "code": "S", "display": "Never Married" } ], "text": "Never Married" }, "multipleBirthBoolean": false, "communication": [ { "language": { "coding": [ { "system": "urn:ietf:bcp:47", "code": "en-US", "display": "English" } ], "text": "English" } } ] }
This can be copied and pasted into https://hl7.github.io/fhirpath.js/ to experiment with FHIRPaths. Note that the JavaScript library used on this testing website is not the same as the Python library used in this notebook, so there may be some implementation differences.
8 Testing with Synthea data
Having test data is very helpful when developing code that uses FHIR Bulk Data. The test data from https://bulk-data.smarthealthit.org may not have all the data elements you need for a specific research use case. Synthea can be used for generating customized synthetic data in FHIR format.
First, we’ll create a short class to mimic the functionality of BulkDataFetcher, but it will load the .ndjson file directly from disk rather than via a bulk data export.
class NDJSONFetcher:
    """
    Load FHIR resources from an NDJSON file on disk and convert them to
    DataFrames, exposing the same ``view_definitions`` /
    ``fhir_to_dataframes()`` usage as BulkDataFetcher.
    """

    def __init__(
        self,
        ndjson_file_path: str,
    ):
        """
        Read every resource in the file, grouped by ``resourceType``, and
        print a summary of what was loaded.

        Args:
            ndjson_file_path: Path to a file with one JSON resource per line.
        """
        self.view_definitions = ViewDefinitions()
        # Same attribute name as on BulkDataFetcher; nothing in this class
        # adds to it -- loaded resources live in resources_by_type.
        self.downloaded_resources = defaultdict(list)
        self.resources_by_type = {}

        # First pass only counts lines so the progress bar has a total.
        # The file is opened in a with-block so the handle is closed.
        with open(ndjson_file_path, 'r', encoding='utf-8') as file:
            num_lines = sum(1 for _ in file)

        with open(ndjson_file_path, 'r', encoding='utf-8') as file:
            for line in tqdm(file, total=num_lines):
                # Skip blank lines (e.g. trailing newlines) instead of
                # passing them to json.loads.
                if not line.strip():
                    continue
                json_obj = json.loads(line)
                this_resource_type = json_obj['resourceType']
                self.resources_by_type.setdefault(this_resource_type, []).append(json_obj)

        print("Resources available: ")
        print('\n'.join(
            f'- {resource_type} (n={len(resources)})'
            for resource_type, resources in self.resources_by_type.items()
        ))

    def fhir_to_dataframes(self):
        """
        Return ``{resource type: DataFrame}`` for every registered ViewDefinition.

        Raises:
            KeyError: If a ViewDefinition is registered for a resource type
                that was not present in the file.
        """
        dfs = {}
        for resource_type, view_definition in self.view_definitions.items():
            dfs[resource_type] = pd.DataFrame(sqlonfhir.evaluate(
                self.resources_by_type[resource_type], view_definition
            ))
        return dfs
# Load 10 patients' worth of Synthea data.
# The original data come from the "1K Sample Synthetic Patient Records, FHIR R4" download at <https://synthea.mitre.org/downloads>.
synthea_fetcher = NDJSONFetcher('../workshops/bulk-data/synthea_10.ndjson')
Resources available:
- Patient (n=10)
- Organization (n=26)
- Practitioner (n=26)
- Encounter (n=308)
- Condition (n=56)
- Device (n=1)
- Claim (n=347)
- ExplanationOfBenefit (n=308)
- CareTeam (n=23)
- Goal (n=12)
- CarePlan (n=23)
- Observation (n=1317)
- Immunization (n=149)
- DiagnosticReport (n=64)
- Procedure (n=243)
- MedicationRequest (n=39)
- ImagingStudy (n=2)
- AllergyIntolerance (n=7)
Here is how to apply FHIRPath expressions to select columns from the Synthea data:
# Columns to pull out of each Patient resource; every "path" is a FHIRPath
# expression and every "name" becomes the resulting column's name.
synthea_patient_columns = [
    {"name": "id", "path": "getResourceKey()"},
    {"name": "sex", "path": "gender"},
    {"name": "birth_date", "path": "birthDate"},
    {"name": "marital_status", "path": "maritalStatus.coding.first().code"},
]

# Register the view definition for Patient resources on the fetcher.
synthea_fetcher.view_definitions.patient = {
    "resource": "Patient",
    "select": [{"column": synthea_patient_columns}],
}

# Evaluate the registered view definitions into DataFrames.
dfs = synthea_fetcher.fhir_to_dataframes()
dfs['Patient']
| | id | sex | birth_date | marital_status |
|---|---|---|---|---|
| 0 | 5cbc121b-cd71-4428-b8b7-31e53eba8184 | male | 1945-12-10 | S |
| 1 | adccf2c3-9dc4-4067-ba23-98982c4875da | male | 1946-03-29 | M |
| 2 | 31191928-6acb-4d73-931c-e601cc3a13fa | female | 2002-10-24 | S |
| 3 | 67816396-e325-496d-a6ec-c047756b7ce4 | male | 1999-12-12 | S |
| 4 | b426b062-8273-4b93-a907-de3176c0567d | male | 2002-04-15 | S |
| 5 | 5c818f3d-7051-4b86-8203-1dc624a91804 | male | 1997-12-26 | S |
| 6 | 346d4b95-5e00-48fe-a21e-076735ca1d74 | male | 2001-11-29 | S |
| 7 | 668605fe-a8dc-4601-ae48-f5bc24bbea74 | female | 1999-11-14 | S |
| 8 | b43d3d32-1e9b-4acb-8578-624cdcc99f86 | male | 2003-02-03 | S |
| 9 | 5dc02d13-5c69-4c36-87d5-16738f088300 | male | 1996-12-04 | S |
You can also get a sample resource to look at the raw JSON:
print(json.dumps(synthea_fetcher.resources_by_type['Patient'][0], indent=4)){ "resourceType": "Patient", "id": "5cbc121b-cd71-4428-b8b7-31e53eba8184", "text": { "status": "generated", "div": "<div xmlns=\"http://www.w3.org/1999/xhtml\">Generated by <a href=\"https://github.com/synthetichealth/synthea\">Synthea</a>.Version identifier: v2.4.0-404-ge7ce2295\n . Person seed: 6457100290386878904 Population seed: 0</div>" }, "extension": [ { "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race", "extension": [ { "url": "ombCategory", "valueCoding": { "system": "urn:oid:2.16.840.1.113883.6.238", "code": "2106-3", "display": "White" } }, { "url": "text", "valueString": "White" } ] }, { "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity", "extension": [ { "url": "ombCategory", "valueCoding": { "system": "urn:oid:2.16.840.1.113883.6.238", "code": "2186-5", "display": "Not Hispanic or Latino" } }, { "url": "text", "valueString": "Not Hispanic or Latino" } ] }, { "url": "http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName", "valueString": "Deadra347 Borer986" }, { "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex", "valueCode": "M" }, { "url": "http://hl7.org/fhir/StructureDefinition/patient-birthPlace", "valueAddress": { "city": "Billerica", "state": "Massachusetts", "country": "US" } }, { "url": "http://synthetichealth.github.io/synthea/disability-adjusted-life-years", "valueDecimal": 14.062655945052095 }, { "url": "http://synthetichealth.github.io/synthea/quality-adjusted-life-years", "valueDecimal": 58.93734405494791 } ], "identifier": [ { "system": "https://github.com/synthetichealth/synthea", "value": "2fa15bc7-8866-461a-9000-f739e425860a" }, { "type": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/v2-0203", "code": "MR", "display": "Medical Record Number" } ], "text": "Medical Record Number" }, "system": "http://hospital.smarthealthit.org", "value": 
"2fa15bc7-8866-461a-9000-f739e425860a" }, { "type": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/v2-0203", "code": "SS", "display": "Social Security Number" } ], "text": "Social Security Number" }, "system": "http://hl7.org/fhir/sid/us-ssn", "value": "999-93-7537" }, { "type": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/v2-0203", "code": "DL", "display": "Driver's License" } ], "text": "Driver's License" }, "system": "urn:oid:2.16.840.1.113883.4.3.25", "value": "S99948707" }, { "type": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/v2-0203", "code": "PPN", "display": "Passport Number" } ], "text": "Passport Number" }, "system": "http://standardhealthrecord.org/fhir/StructureDefinition/passportNumber", "value": "X14078167X" } ], "name": [ { "use": "official", "family": "Brekke496", "given": [ "Aaron697" ], "prefix": [ "Mr." ] } ], "telecom": [ { "system": "phone", "value": "555-677-3119", "use": "home" } ], "gender": "male", "birthDate": "1945-12-10", "address": [ { "extension": [ { "url": "http://hl7.org/fhir/StructureDefinition/geolocation", "extension": [ { "url": "latitude", "valueDecimal": 41.93879298871088 }, { "url": "longitude", "valueDecimal": -71.06682353144593 } ] } ], "line": [ "894 Brakus Bypass" ], "city": "Taunton", "state": "Massachusetts", "postalCode": "02718", "country": "US" } ], "maritalStatus": { "coding": [ { "system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus", "code": "S", "display": "S" } ], "text": "S" }, "multipleBirthBoolean": false, "communication": [ { "language": { "coding": [ { "system": "urn:ietf:bcp:47", "code": "en-US", "display": "English" } ], "text": "English" } } ] }