Raw vs Semantic Layers Architecture
Understanding the distinction between raw and semantic data layers is
essential for building effective analytics systems on Spartera. This
architecture pattern separates data storage concerns from business
logic, enabling better AI comprehension, maintainability, and
performance optimization.
Layer Definitions
Raw Layer (Bronze/Landing Layer)
The raw layer contains data in its most unprocessed form, exactly as
received from source systems.
Characteristics:
- Unmodified data from source systems
- Preserves original data types and formats
- Includes all fields from source systems
- Immutable once loaded
- Optimized for data ingestion and storage
Semantic Layer (Gold/Business Layer)
The semantic layer provides business-friendly, cleaned, and enriched
data ready for analytics consumption.
Characteristics:
- Business-friendly column names and values
- Data quality rules applied
- Business logic implemented
- Enriched with calculated fields
- Optimized for analytics queries
Architectural Patterns
Traditional Three-Layer Architecture
-- RAW LAYER: Direct from source systems
CREATE TABLE raw_customer_data (
cust_id VARCHAR(50),
fname VARCHAR(100),
lname VARCHAR(100),
email_addr VARCHAR(200),
reg_dt TIMESTAMP,
stat_cd VARCHAR(10),
ltv_val DECIMAL(15,2),
src_sys VARCHAR(50),
load_ts TIMESTAMP
);
-- REFINED LAYER: Cleaned and standardized
CREATE TABLE refined_customer_data (
customer_id BIGINT,
first_name VARCHAR(100),
last_name VARCHAR(100),
email_address VARCHAR(200),
registration_date DATE,
customer_status VARCHAR(20),
lifetime_value DECIMAL(12,2),
source_system VARCHAR(50),
data_quality_score DECIMAL(3,2),
last_updated_timestamp TIMESTAMP
);
-- SEMANTIC LAYER: Business-ready analytics data
CREATE TABLE customer_analytics (
customer_id BIGINT,
customer_full_name VARCHAR(200),
email_address VARCHAR(200),
registration_date DATE,
customer_status_description VARCHAR(50),
customer_lifetime_value_usd DECIMAL(12,2),
customer_segment VARCHAR(50),
acquisition_channel VARCHAR(100),
days_since_registration INTEGER,
is_high_value_customer BOOLEAN,
predicted_churn_risk DECIMAL(5,4),
last_purchase_date DATE,
total_orders INTEGER,
average_order_value DECIMAL(10,2)
);
Modern ELT Pattern
-- RAW LAYER: Staging area for all source data
CREATE SCHEMA raw_data;
CREATE TABLE raw_data.ecommerce_transactions (
transaction_raw_json JSON,
source_file_name VARCHAR(500),
ingestion_timestamp TIMESTAMP,
partition_date DATE
) PARTITION BY (partition_date);
-- SEMANTIC LAYER: Business logic applied via views/tables
CREATE SCHEMA analytics;
CREATE TABLE analytics.transaction_facts AS
SELECT
JSON_EXTRACT_SCALAR(transaction_raw_json, '$.transaction_id')
AS transaction_id,
JSON_EXTRACT_SCALAR(transaction_raw_json, '$.customer_id')
AS customer_id,
CAST(JSON_EXTRACT_SCALAR(transaction_raw_json, '$.amount')
AS DECIMAL(10,2)) AS transaction_amount_usd,
PARSE_DATETIME('%Y-%m-%d %H:%M:%S',
JSON_EXTRACT_SCALAR(transaction_raw_json, '$.timestamp'))
AS transaction_datetime,
JSON_EXTRACT_SCALAR(transaction_raw_json, '$.product_sku')
AS product_sku,
JSON_EXTRACT_SCALAR(transaction_raw_json, '$.payment_method')
AS payment_method_code,
-- Business logic enrichment
CASE
WHEN JSON_EXTRACT_SCALAR(transaction_raw_json,
'$.payment_method') = 'CC' THEN 'Credit Card'
WHEN JSON_EXTRACT_SCALAR(transaction_raw_json,
'$.payment_method') = 'PP' THEN 'PayPal'
WHEN JSON_EXTRACT_SCALAR(transaction_raw_json,
'$.payment_method') = 'BT' THEN 'Bank Transfer'
ELSE 'Other'
END AS payment_method_description,
CASE
WHEN CAST(JSON_EXTRACT_SCALAR(transaction_raw_json,
'$.amount') AS DECIMAL(10,2)) >= 1000 THEN 'High Value'
WHEN CAST(JSON_EXTRACT_SCALAR(transaction_raw_json,
'$.amount') AS DECIMAL(10,2)) >= 100 THEN 'Medium Value'
ELSE 'Low Value'
END AS transaction_value_tier
FROM raw_data.ecommerce_transactions
WHERE partition_date >= CURRENT_DATE - INTERVAL '90' DAY;
Implementation Strategies
1. Raw Layer Design Principles
Preserve Data Integrity
-- Include metadata for traceability
CREATE TABLE raw_sales_data (
-- Original data
raw_record_json JSON,
-- Metadata
source_system_id VARCHAR(50),
extraction_timestamp TIMESTAMP,
file_batch_id VARCHAR(100),
record_hash VARCHAR(64), -- For deduplication
data_lineage_info JSON
);
Schema Evolution Support
-- Use flexible schemas that can accommodate changes
CREATE TABLE raw_user_events (
event_id BIGINT,
event_payload JSON, -- Flexible for schema changes
event_version VARCHAR(10), -- Track payload structure versions
ingestion_time TIMESTAMP,
source_app VARCHAR(50)
);
2. Semantic Layer Design Principles
Business-Friendly Naming
-- Good: Clear, descriptive business terminology
CREATE TABLE customer_purchase_behavior (
customer_identifier BIGINT,
customer_full_name VARCHAR(200),
total_lifetime_purchases_usd DECIMAL(15,2),
average_days_between_purchases DECIMAL(8,2),
preferred_product_category VARCHAR(100),
customer_acquisition_date DATE,
is_premium_customer BOOLEAN,
predicted_next_purchase_date DATE
);
-- Avoid: Technical jargon or cryptic abbreviations
CREATE TABLE cust_purch_bhvr (
cid BIGINT,
nm VARCHAR(200),
ltv DECIMAL(15,2),
avg_dys DECIMAL(8,2),
prd_cat VARCHAR(100),
acq_dt DATE,
prem_flg BOOLEAN,
nxt_purch_dt DATE
);
Business Logic Implementation
-- Implement complex business rules in semantic layer
CREATE VIEW customer_health_score AS
SELECT
customer_id,
customer_name,
-- Business metrics calculation
CASE
WHEN days_since_last_purchase <= 30
AND total_orders >= 10 THEN 'Highly Engaged'
WHEN days_since_last_purchase <= 60
AND total_orders >= 5 THEN 'Engaged'
WHEN days_since_last_purchase <= 180
AND total_orders >= 2 THEN 'At Risk'
ELSE 'Churned'
END AS customer_engagement_status,
-- Calculated business value
(total_lifetime_value / NULLIF(total_orders, 0))
AS average_order_value_usd,
-- Predictive indicators
CASE
WHEN customer_engagement_status = 'Churned'
AND total_lifetime_value > 5000
THEN 'High Priority Winback'
WHEN customer_engagement_status = 'At Risk'
AND average_order_value_usd > 200
THEN 'Retention Campaign'
ELSE 'Standard'
END AS recommended_action
FROM customer_base_metrics;
Data Quality and Governance
Raw Layer Quality Checks
-- Data validation at ingestion
CREATE TABLE raw_data_quality_log (
batch_id VARCHAR(100),
table_name VARCHAR(100),
validation_timestamp TIMESTAMP,
total_records INTEGER,
valid_records INTEGER,
invalid_records INTEGER,
validation_rules_applied TEXT[],
data_quality_score DECIMAL(5,4)
);
-- Example validation for raw data
INSERT INTO raw_data_quality_log
SELECT
'${batch_id}' as batch_id,
'raw_customer_data' as table_name,
CURRENT_TIMESTAMP as validation_timestamp,
COUNT(*) as total_records,
COUNT(CASE WHEN email_addr ~
'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'
THEN 1 END) as valid_records,
COUNT(CASE WHEN email_addr !~
'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'
OR email_addr IS NULL THEN 1 END) as invalid_records,
ARRAY['email_format_check', 'null_check']
as validation_rules_applied,
COUNT(CASE WHEN email_addr ~
'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'
THEN 1 END)::DECIMAL / COUNT(*)::DECIMAL
as data_quality_score
FROM raw_customer_data
WHERE load_ts >= CURRENT_DATE;
Semantic Layer Quality Assurance
-- Business rule validation
CREATE VIEW semantic_data_quality_checks AS
SELECT
'customer_analytics' as table_name,
'customer_lifetime_value_consistency' as check_name,
COUNT(*) as total_records,
COUNT(CASE WHEN customer_lifetime_value_usd < 0 THEN 1 END)
as failed_records,
'Customer lifetime value should not be negative' as business_rule
FROM customer_analytics
UNION ALL
SELECT
'customer_analytics' as table_name,
'registration_date_validity' as check_name,
COUNT(*) as total_records,
COUNT(CASE WHEN registration_date > CURRENT_DATE THEN 1 END)
as failed_records,
'Registration date should not be in the future' as business_rule
FROM customer_analytics;
Performance Optimization Patterns
Raw Layer Optimization
-- Partition by ingestion date for efficient data management
CREATE TABLE raw_transaction_stream (
transaction_data JSON,
ingestion_timestamp TIMESTAMP,
source_system VARCHAR(50)
)
PARTITION BY DATE(ingestion_timestamp)
CLUSTER BY source_system;
-- Create indexes for common access patterns
CREATE INDEX idx_raw_ingestion_time
ON raw_transaction_stream(ingestion_timestamp);
CREATE INDEX idx_raw_source_system
ON raw_transaction_stream(source_system);
Semantic Layer Optimization
-- Materialized views for complex business logic
CREATE MATERIALIZED VIEW customer_360_summary AS
SELECT
c.customer_id,
c.customer_name,
c.registration_date,
-- Aggregate metrics from multiple sources
COUNT(DISTINCT o.order_id) as total_orders,
SUM(o.order_total) as lifetime_value_usd,
AVG(o.order_total) as average_order_value_usd,
MAX(o.order_date) as last_order_date,
-- Business categorization
CASE
WHEN SUM(o.order_total) >= 10000 THEN 'VIP'
WHEN SUM(o.order_total) >= 1000 THEN 'Premium'
ELSE 'Standard'
END as customer_tier,
-- Engagement metrics
DATE_DIFF(CURRENT_DATE, MAX(o.order_date), DAY)
as days_since_last_order
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.customer_name, c.registration_date;
-- Refresh strategy
CREATE OR REPLACE PROCEDURE refresh_customer_360()
BEGIN
REFRESH MATERIALIZED VIEW customer_360_summary;
END;
Integration with Spartera
Best Practices for AI Analytics
1. Semantic Layer for AI Consumption
-- Design semantic tables with AI-friendly characteristics
CREATE TABLE ai_ready_customer_metrics (
customer_unique_identifier BIGINT
COMMENT 'Primary customer identifier',
customer_acquisition_date DATE
COMMENT 'Date when customer first registered',
customer_lifetime_value_dollars DECIMAL(12,2)
COMMENT 'Total revenue generated by customer',
customer_satisfaction_score DECIMAL(3,2)
COMMENT 'Score from 1-5 based on surveys and interactions',
predicted_churn_probability DECIMAL(5,4)
COMMENT 'ML model prediction of churn likelihood',
preferred_contact_method VARCHAR(50)
COMMENT 'Email, Phone, SMS, or In-App notification',
average_monthly_spending DECIMAL(10,2)
COMMENT 'Average amount spent per month over last 12 months',
product_category_affinity VARCHAR(100)
COMMENT 'Primary product category customer purchases most',
geographic_region VARCHAR(100)
COMMENT 'Customer geographic location for regional analysis'
);
2. Data Lineage and Documentation
-- Include comprehensive metadata for AI understanding
CREATE TABLE semantic_table_metadata (
table_name VARCHAR(100),
column_name VARCHAR(100),
business_definition TEXT,
calculation_logic TEXT,
data_source_tables VARCHAR(500),
update_frequency VARCHAR(50),
data_quality_threshold DECIMAL(5,4)
);
-- Example metadata entries
INSERT INTO semantic_table_metadata VALUES
('customer_analytics', 'customer_lifetime_value_usd',
'Total revenue generated by customer across all time',
'SUM(order_total) FROM orders WHERE customer_id = customer_analytics.customer_id',
'raw_orders, raw_payments, raw_refunds',
'Daily at 2 AM UTC',
0.95);
3. Consistent Business Metrics
-- Standardize business metric calculations across semantic layer
CREATE VIEW standard_business_metrics AS
SELECT
metric_name,
metric_definition,
calculation_formula,
business_owner
FROM (
VALUES
('Monthly Active Users',
'Unique users with at least one session in the last 30 days',
'COUNT(DISTINCT user_id) WHERE last_activity >= CURRENT_DATE - 30',
'Product Team'),
('Customer Lifetime Value',
'Total revenue from customer over their entire relationship',
'SUM(order_total) / COUNT(DISTINCT customer_id)',
'Finance Team'),
('Average Order Value',
'Mean value of orders in specified time period',
'SUM(order_total) / COUNT(order_id)',
'Sales Team')
) AS metrics(metric_name, metric_definition, calculation_formula,
business_owner);
Migration Strategies
From Raw to Semantic Implementation
Phase 1: Raw Layer Foundation
-- 1. Establish raw data ingestion
CREATE PROCEDURE ingest_raw_data(
source_table VARCHAR,
target_raw_table VARCHAR
)
BEGIN
INSERT INTO ${target_raw_table} (
raw_data_json,
source_system,
ingestion_timestamp,
batch_id
)
SELECT
TO_JSON(source.*),
'${source_system}',
CURRENT_TIMESTAMP,
'${batch_id}'
FROM ${source_table} source;
END;
Phase 2: Incremental Semantic Layer
-- 2. Build semantic transformations incrementally
CREATE VIEW semantic_customers_v1 AS
SELECT
CAST(JSON_EXTRACT_SCALAR(raw_data_json, '$.customer_id')
AS BIGINT) as customer_id,
JSON_EXTRACT_SCALAR(raw_data_json, '$.first_name') || ' ' ||
JSON_EXTRACT_SCALAR(raw_data_json, '$.last_name')
as customer_full_name,
PARSE_DATE('%Y-%m-%d',
JSON_EXTRACT_SCALAR(raw_data_json, '$.registration_date'))
as customer_registration_date
FROM raw_customer_data
WHERE ingestion_timestamp >= CURRENT_DATE - INTERVAL '1' DAY;
-- 3. Evolve to more complex business logic
CREATE VIEW semantic_customers_v2 AS
SELECT
*,
CASE
WHEN DATE_DIFF(CURRENT_DATE, customer_registration_date, DAY)
<= 30 THEN 'New Customer'
WHEN DATE_DIFF(CURRENT_DATE, customer_registration_date, DAY)
<= 365 THEN 'Established Customer'
ELSE 'Long-term Customer'
END as customer_tenure_category
FROM semantic_customers_v1;
Common Anti-Patterns to Avoid
1. Raw Layer Pollution
-- WRONG: Adding business logic to raw layer
CREATE TABLE raw_sales_data (
transaction_id BIGINT,
amount DECIMAL(10,2),
-- Don't do this in raw layer:
amount_category VARCHAR(20), -- Business logic belongs in semantic layer
is_high_value BOOLEAN -- Derived fields belong in semantic layer
);
-- CORRECT: Keep raw layer pure
CREATE TABLE raw_sales_data (
transaction_id BIGINT,
amount DECIMAL(10,2),
transaction_timestamp TIMESTAMP,
source_system VARCHAR(50),
ingestion_timestamp TIMESTAMP
);
2. Semantic Layer Over-complexity
-- WRONG: Too many nested calculations making it hard to understand
CREATE VIEW complex_customer_metrics AS
SELECT
customer_id,
(
CASE WHEN (
SELECT AVG(
CASE WHEN order_total > (
SELECT PERCENTILE_CONT(0.75)
WITHIN GROUP (ORDER BY order_total)
FROM orders o2 WHERE o2.customer_id = o1.customer_id
) THEN 1 ELSE 0 END
) FROM orders o1 WHERE o1.customer_id = customers.customer_id
) > 0.5 THEN 'Premium' ELSE 'Standard'
END
) as customer_segment; -- Too complex!
-- CORRECT: Break down complex logic into understandable steps
CREATE VIEW customer_order_percentiles AS
SELECT
customer_id,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY order_total)
as p75_order_value
FROM orders
GROUP BY customer_id;
CREATE VIEW customer_premium_indicator AS
SELECT
o.customer_id,
AVG(CASE WHEN o.order_total > p.p75_order_value
THEN 1 ELSE 0 END) as premium_order_ratio
FROM orders o
JOIN customer_order_percentiles p ON o.customer_id = p.customer_id
GROUP BY o.customer_id;
CREATE VIEW customer_segments AS
SELECT
customer_id,
CASE WHEN premium_order_ratio > 0.5
THEN 'Premium' ELSE 'Standard' END as customer_segment
FROM customer_premium_indicator;
3. Inconsistent Layer Definitions
-- WRONG: Mixing raw and semantic data in same table
CREATE TABLE mixed_customer_data (
raw_customer_json JSON, -- Raw layer data
customer_full_name VARCHAR(200), -- Semantic layer data
source_file VARCHAR(500), -- Raw layer metadata
customer_tier VARCHAR(50) -- Semantic layer business logic
);
-- CORRECT: Clear separation of concerns
CREATE TABLE raw_customer_data (
customer_raw_json JSON,
source_file VARCHAR(500),
ingestion_timestamp TIMESTAMP
);
CREATE TABLE semantic_customer_profile (
customer_id BIGINT,
customer_full_name VARCHAR(200),
customer_tier VARCHAR(50),
last_updated_timestamp TIMESTAMP
);
Monitoring and Maintenance
Data Freshness Monitoring
-- Monitor data pipeline health
CREATE VIEW data_pipeline_health AS
SELECT
'raw_layer' as layer_type,
table_name,
MAX(ingestion_timestamp) as last_update,
DATE_DIFF(CURRENT_TIMESTAMP, MAX(ingestion_timestamp), MINUTE)
as minutes_since_last_update,
CASE
WHEN DATE_DIFF(CURRENT_TIMESTAMP, MAX(ingestion_timestamp),
MINUTE) > 60 THEN 'STALE'
WHEN DATE_DIFF(CURRENT_TIMESTAMP, MAX(ingestion_timestamp),
MINUTE) > 30 THEN 'WARNING'
ELSE 'FRESH'
END as freshness_status
FROM (
SELECT 'raw_customer_data' as table_name, ingestion_timestamp
FROM raw_customer_data
UNION ALL
SELECT 'raw_order_data' as table_name, ingestion_timestamp
FROM raw_order_data
) raw_tables
GROUP BY table_name
UNION ALL
SELECT
'semantic_layer' as layer_type,
table_name,
MAX(last_updated_timestamp) as last_update,
DATE_DIFF(CURRENT_TIMESTAMP, MAX(last_updated_timestamp), MINUTE)
as minutes_since_last_update,
CASE
WHEN DATE_DIFF(CURRENT_TIMESTAMP, MAX(last_updated_timestamp),
MINUTE) > 120 THEN 'STALE'
WHEN DATE_DIFF(CURRENT_TIMESTAMP, MAX(last_updated_timestamp),
MINUTE) > 60 THEN 'WARNING'
ELSE 'FRESH'
END as freshness_status
FROM (
SELECT 'customer_analytics' as table_name, last_updated_timestamp
FROM customer_analytics
UNION ALL
SELECT 'order_analytics' as table_name, last_updated_timestamp
FROM order_analytics
) semantic_tables
GROUP BY table_name;
Performance Monitoring
-- Track query performance across layers
CREATE TABLE query_performance_metrics (
query_timestamp TIMESTAMP,
layer_type VARCHAR(20),
table_name VARCHAR(100),
query_duration_seconds DECIMAL(10,3),
rows_processed BIGINT,
query_type VARCHAR(50) -- SELECT, INSERT, UPDATE, etc.
);
-- Example monitoring query
SELECT
layer_type,
AVG(query_duration_seconds) as avg_duration,
PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY query_duration_seconds)
as p95_duration,
COUNT(*) as query_count
FROM query_performance_metrics
WHERE query_timestamp >= CURRENT_DATE - INTERVAL '7' DAY
GROUP BY layer_type;
By implementing a clear separation between raw and semantic layers,
you'll create a robust data architecture that maximizes Spartera's AI
capabilities while maintaining data quality, performance, and business
alignment. The semantic layer becomes your AI-ready interface, while
the raw layer provides the reliable foundation for all downstream
processing.
