Connect Soda to Dask and Pandas
Access configuration details to connect Soda to Dask and Pandas.
Connection configuration reference
Load CSV file into Dataframe
import pandas as pd
import dask
import dask.datasets
from soda.scan import Scan
# Example: run a Soda scan against in-memory Dask and Pandas dataframes.
# The script registers dataframes as datasets, supplies Soda Cloud
# configuration, declares SodaCL checks inline, and executes the scan.

# Read more info in "Note on new release" section
dask.config.set({"dataframe.convert-string": False})

# Create a Soda scan object
scan = Scan()

# Load timeseries data from dask datasets
df_timeseries = dask.datasets.timeseries().reset_index()
df_timeseries["email"] = "a@soda.io"

# Create an artificial pandas dataframe. It contains addresses that are not
# present in the timeseries dataframe, so the reference check declared for
# "employee" below is expected to fail.
df_employee = pd.DataFrame({"email": ["a@soda.io", "b@soda.io", "c@soda.io"]})

# Either add Dask dataframe to scan and assign a dataset name to refer from checks.yaml
scan.add_dask_dataframe(dataset_name="timeseries", dask_df=df_timeseries, data_source_name="orders")

# OR, add Pandas dataframe to scan and assign a dataset name to refer from checks.yaml
scan.add_pandas_dataframe(dataset_name="employee", pandas_df=df_employee, data_source_name="orders")

# Optionally, add multiple dataframes as unique data sources. Note the change of
# the data_source_name parameter.
# NOTE: `[...]` is a documentation placeholder; replace it with a real Dask
# dataframe before running this script.
scan.add_dask_dataframe(dataset_name="inquiries", dask_df=[...], data_source_name="customers")

# Set the scan definition name and default data source to use
scan.set_scan_definition_name("test")
scan.set_data_source_name("orders")

# Add configuration YAML file
# You do not need connection to a data source; you must have a connection to Soda Cloud
# Choose one of the following two options:
# 1) From a file
scan.add_configuration_yaml_file(file_path="~/.soda/configuration.yml")

# 2) Inline in the code
# For host, use cloud.soda.io for EU region; use cloud.us.soda.io for US region
# The keys below must be nested under `soda_cloud`, so the YAML indentation
# inside the string is significant.
scan.add_configuration_yaml_str(
    """
soda_cloud:
  host: cloud.soda.io
  api_key_id: 2e0ba0cb-your-api-key-7b
  api_key_secret: 5wd-your-api-key-secret-aGuRg
"""
)

# Define checks in yaml format
# Alternatively, refer to a yaml file using scan.add_sodacl_yaml_file(<filepath>)
# Indentation inside this string is part of the YAML structure: each check's
# configuration keys are nested beneath the check they belong to.
checks = """
for each dataset T:
  datasets:
    - include %
  checks:
    - row_count > 0
profile columns:
  columns:
    - employee.%
checks for employee:
  - values in (email) must exist in timeseries (email)  # Error expected
  - row_count same as timeseries  # Error expected
checks for timeseries:
  - avg_x_minus_y between -1 and 1:
      avg_x_minus_y expression: AVG(x - y)
  - failed rows:
      samples limit: 50
      fail condition: x >= 3
  - schema:
      name: Confirm that required columns are present
      warn:
        when required column missing: [x]
        when forbidden column present: [email]
        when wrong column type:
          email: varchar
      fail:
        when required column missing:
          - y
  - invalid_count(email) = 0:
      valid format: email
  - valid_count(email) > 0:
      valid format: email
"""

# Register the checks, enable verbose logging, and prepare the scan for execution
scan.add_sodacl_yaml_str(checks)
scan.set_verbose(True)
scan.execute()

Load JSON file into Dataframe
Add optional parameter for COUNT
COUNT Parameter setting
behavior
Add optional parameter for text data conversion
Troubleshoot
Last updated
Was this helpful?
