"""Temporal data-migration workflow: fetch, transform, validate, and load source rows in batches."""

from datetime import timedelta
import pandas as pd
from connectors.database import DatabaseConnector
from temporalio import workflow, activity
from temporalio.common import RetryPolicy
from typing import Dict, List
from config import source_config, target_config, config
from transformation.wo_transform import WoDataTransformer
# Activities
@activity.defn
async def fetch_data_activity(batch_size: int, last_id: int, columns) -> List[Dict]:
    """Fetch one batch of source rows with id greater than ``last_id``.

    Args:
        batch_size: Maximum number of rows to fetch in this batch.
        last_id: Keyset-pagination cursor; only rows after this id are read.
        columns: Columns to select — presumably a list of column names;
            TODO confirm against ``DatabaseConnector.fetch_batch``.

    Returns:
        The batch as a list of record dicts (one per row); empty when the
        source is exhausted.
    """
    db_connector = DatabaseConnector(source_config, target_config)
    df = db_connector.fetch_batch(batch_size, last_id, columns)
    # orient='records' produces a list of {column: value} dicts, so the
    # correct return annotation is List[Dict] (the original said Dict).
    return df.to_dict(orient='records')
@activity.defn
async def transform_data_activity(data: List[Dict], config=None) -> List[Dict]:
    """Apply ``WoDataTransformer`` to a batch of records.

    Args:
        data: Batch as a list of record dicts (as produced by
            ``fetch_data_activity``).
        config: Accepted for compatibility — the workflow invokes this
            activity with a second positional argument, which would fail
            against a one-parameter signature. Currently unused.

    Returns:
        The transformed batch as a list of record dicts.
    """
    transformer = WoDataTransformer()
    df = pd.DataFrame(data)
    transformed_df = transformer.transform(df)
    return transformed_df.to_dict(orient='records')
@activity.defn
async def validate_data_activity(data: List[Dict]) -> bool:
    """Return True when the batch of records passes ``WoDataTransformer`` validation."""
    checker = WoDataTransformer()
    return checker.validate(pd.DataFrame(data))
@activity.defn
async def load_data_activity(data: List[Dict], target_table: str) -> bool:
    """Write a batch of records into ``target_table``.

    Returns whatever ``DatabaseConnector.load_batch`` reports — presumably a
    success flag.
    """
    connector = DatabaseConnector(source_config, target_config)
    frame = pd.DataFrame(data)
    return connector.load_batch(frame, target_table)
# Workflow
@workflow.defn
class DataMigrationWorkflow:
    """Batched data migration: fetch -> transform -> validate -> load.

    Progress is tracked with keyset pagination on the source ``id`` column
    (``_last_id``) plus a running row count (``_total_processed``).
    """

    def __init__(self):
        self._total_processed = 0  # rows migrated so far
        self._last_id = 0          # highest source id already processed

    @workflow.run
    async def run(self) -> Dict:
        """Run the migration loop until the source is exhausted.

        Returns:
            Summary dict with ``total_processed`` and ``last_id``.

        Raises:
            ValueError: when a transformed batch fails validation.
            RuntimeError: when loading a batch into the target fails.
        """
        retry_policy = RetryPolicy(
            initial_interval=timedelta(seconds=1),
            maximum_interval=timedelta(minutes=10),
            maximum_attempts=3,
        )
        batch_size = config.get('batch_size', 1000)
        target_table = config.get('target_table')
        columns = config.get('columns')

        while True:
            # 1. Fetch the next batch after the last processed id.
            data = await workflow.execute_activity(
                fetch_data_activity,
                args=[batch_size, self._last_id, columns],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=5),
            )
            if not data:
                break  # source exhausted

            # 2. Transform. BUGFIX: the activity takes a single ``data``
            # parameter; the original passed ``config`` as a second
            # positional argument, which fails at invocation time.
            transformed_data = await workflow.execute_activity(
                transform_data_activity,
                args=[data],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=10),
            )

            # 3. Validate before writing anything to the target.
            is_valid = await workflow.execute_activity(
                validate_data_activity,
                args=[transformed_data],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=5),
            )
            if not is_valid:
                raise ValueError(
                    f"Data validation failed for batch after ID {self._last_id}")

            # 4. Load the validated batch into the target table.
            success = await workflow.execute_activity(
                load_data_activity,
                args=[transformed_data, target_table],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=10),
            )
            if not success:
                # RuntimeError is a subclass of Exception, so existing
                # ``except Exception`` handlers still catch it.
                raise RuntimeError(
                    f"Failed to load batch after ID {self._last_id}")

            # Update progress. Assumes each fetched record carries an 'id'
            # key — TODO confirm against DatabaseConnector.fetch_batch.
            self._total_processed += len(data)
            self._last_id = data[-1]['id']

            # TODO(review): persist progress (last_id / total_processed) via
            # a dedicated activity so an interrupted migration can resume.

        return {
            'total_processed': self._total_processed,
            'last_id': self._last_id,
        }