Complete guide to generating realistic mock data with perfect referential integrity
# Blueprint.yaml
# Template: an entity with a record count and a map of field definitions,
# each naming a generator and its generator-specific config.
MyEntity:
  count: 10  # Number of records to generate
  fields:
    MyField 1:
      generator: generator_name
      config:
        # ... generator-specific options

start_at (optional, default: 1) - Starting integer value
increment (optional, default: 1) - The value to increment by
# Five users whose user_id is produced by the sequence generator from 1.
users:
  count: 5
  fields:
    user_id:
      generator: sequence
      config:
        start_at: 1

generator (required) - Faker provider (name, email, address, etc.)
locale (optional, default: en_US) - Language/region (de_DE, fr_FR, ja_JP)
... - Additional provider-specific arguments
# Five users whose full_name comes from the Faker "name" provider.
users:
  count: 5
  fields:
    full_name:
      generator: faker
      config:
        generator: "name"
# Same as the plain faker example, with the locale set to German (de_DE).
users:
  count: 5
  fields:
    full_name:
      generator: faker
      config:
        generator: "name"
        locale: "de_DE"

name, first_name, last_name, email, address, phone_number, company, job, text, sentence, url, ipv4, date_of_birth, pyint, pyfloat, pydecimal, uuid4, color_name, file_name

choices (required) - List of values to choose from
weights (optional) - Probability weights for each choice
# Five records whose status is picked from the listed choices.
MyEntity:
  count: 5
  fields:
    status:
      generator: choice
      config:
        choices: ["pending", "active", "completed"]
# Five records whose status is picked from the listed choices, with one
# probability weight per choice (same order as `choices`).
MyEntity:
  count: 5
  fields:
    status:
      generator: choice
      config:
        choices: ["pending", "active", "completed"]
        weights: [0.3, 0.5, 0.2]

start_date (required, String) - Start of date range
end_date (required, String) - End of date range. Must be after start_date
format (optional, String) - Output format using strftime directives. If not provided, the timestamp will be in ISO 8601 format
# Five records with a created_at timestamp inside the given date range.
# Dates are quoted so they stay strings rather than YAML date values.
MyEntity:
  count: 5
  fields:
    created_at:
      generator: timestamp
      config:
        start_date: "2023-01-01"
        end_date: "2023-12-31"
# Five records with a created_at timestamp inside the given date-time range,
# rendered with a custom strftime format.
MyEntity:
  count: 5
  fields:
    created_at:
      generator: timestamp
      config:
        # Date and time written in the same "YYYY-MM-DD HH:MM:SS" shape
        # as end_date.
        start_date: "2023-01-01 00:00:00"
        end_date: "2024-01-01 23:59:59"
        format: "%Y-%m-%d %H:%M"

ref: (Required, String) - Primary reference value in the form EntityName.FieldName (Ex: Users.id)
# Two users with a sequential id and a Faker-generated name.
users:
  count: 2
  fields:
    id:
      generator: sequence
      config:
        start_at: 1
    name:
      generator: faker
      config:
        generator: "name"
# Five orders; customer_id references the id field of the users entity.
orders:
  count: 5
  fields:
    order_id:
      generator: sequence
      config:
        start_at: 101
    customer_id:  # Primary reference
      generator: ref
      config:
        ref: "users.id"  # Customer Id is filled with a random user id
use_record_from: (Required, String) - Primary reference entity whose additional values should be extracted
field_to_get: (Required, String) - Field value to extract
# Two users with a sequential id and a Faker-generated name.
users:
  count: 2
  fields:
    id:
      generator: sequence
      config:
        start_at: 1
    name:
      generator: faker
      config:
        generator: "name"
# Five orders; customer_id references a user id, and customer_name is read
# from the same user record that supplied customer_id.
orders:
  count: 5
  fields:
    order_id:
      generator: sequence
      config:
        start_at: 101
    customer_id:  # Primary reference
      generator: ref
      config:
        ref: users.id  # Customer Id is filled with a random user id
    customer_name:  # Secondary reference
      generator: ref
      config:
        # The User record used for customer_id field is used
        use_record_from: customer_id
        field_to_get: name
# Twenty products with a sequential id starting at 201, a Faker catch-phrase
# name, and a positive Faker decimal price with two digits on each side.
Products:
  count: 20
  fields:
    product_id:
      generator: sequence
      config:
        start_at: 201
    name:
      generator: faker
      config:
        generator: catch_phrase
    price:
      generator: faker
      config:
        generator: pydecimal
        left_digits: 2
        right_digits: 2
        positive: true
# One hundred order items. order_id and product_id are primary references;
# unit_price is taken from the price field of the product record chosen for
# product_id.
OrderItems:
  count: 100
  fields:
    item_id:
      generator: sequence
      config:
        start_at: 7001
    order_id:
      generator: ref
      config:
        ref: Orders.order_id
    product_id:
      generator: ref
      config:
        ref: Products.product_id
    unit_price:
      generator: ref
      config:
        use_record_from: product_id
        field_to_get: price
expression (required) - Python expression to evaluate
current, random, math, datetime, uuid4, sum, len, min, max, str, int, float
# Five order items whose quantity, total_price and category are computed by
# Python expressions. Expressions are quoted so they are always read as
# plain strings.
OrderItems:
  count: 5
  fields:
    price:
      generator: faker
      config:
        generator: pydecimal
        left_digits: 2
        right_digits: 2
        positive: true
    quantity:
      generator: expr
      config:
        # Random quantity
        expression: "random.randint(1, 25)"
    total_price:
      generator: expr
      config:
        # Calculate price based on generated quantity
        expression: "current['price'] * current['quantity']"
    category:
      generator: expr
      config:
        # Conditional logic
        expression: "'bulk' if current['quantity'] > 5 else 'regular'"
values (required) - List of values to cycle through
# Five items with a sequential id starting at 100 and a status taken from
# the enum generator's list of values.
Items:
  count: 5
  fields:
    id:
      generator: sequence
      config:
        start_at: 100
    status:
      generator: enum
      config:
        values: ["active", "pending", "expired"]
--format - Output format. Allowed values are csv (default), json, parquet
--seed - Random seed for reproducible data
--output - Output directory
Creates a sample Blueprint.yaml file to get you started.
--output - The name of the blueprint file to generate. Default is 'Blueprint.yaml'.