from datetime import timedelta
from typing import Dict, List

from temporalio import activity, workflow
from temporalio.common import RetryPolicy

# Pass heavyweight third-party imports through the workflow sandbox so the
# worker does not re-import them on every workflow run.
with workflow.unsafe.imports_passed_through():
    import pandas as pd

    from config import config, source_config, target_config
    from connectors.database import DatabaseConnector
    from transformation.wo_transform import WoDataTransformer


# Activities

@activity.defn
async def fetch_data_activity(batch_size: int, last_id: int, columns: List[str]) -> List[Dict]:
    """Fetch one batch of source rows with IDs greater than last_id."""
    db_connector = DatabaseConnector(source_config, target_config)
    df = db_connector.fetch_batch(batch_size, last_id, columns)
    return df.to_dict(orient='records')


@activity.defn
async def transform_data_activity(data: List[Dict]) -> List[Dict]:
    """Apply the work-order transformation to a batch of records."""
    transformer = WoDataTransformer()
    df = pd.DataFrame(data)
    transformed_df = transformer.transform(df)
    return transformed_df.to_dict(orient='records')


@activity.defn
async def validate_data_activity(data: List[Dict]) -> bool:
    """Validate a transformed batch; returns True if the batch is valid."""
    transformer = WoDataTransformer()
    df = pd.DataFrame(data)
    return transformer.validate(df)


@activity.defn
async def load_data_activity(data: List[Dict], target_table: str) -> bool:
    """Load a validated batch into the target table."""
    db_connector = DatabaseConnector(source_config, target_config)
    df = pd.DataFrame(data)
    return db_connector.load_batch(df, target_table)


# Workflow

@workflow.defn
class DataMigrationWorkflow:
    def __init__(self) -> None:
        self._total_processed = 0
        self._last_id = 0

    @workflow.run
    async def run(self) -> Dict:
        retry_policy = RetryPolicy(
            initial_interval=timedelta(seconds=1),
            maximum_interval=timedelta(minutes=10),
            maximum_attempts=3,
        )

        batch_size = config.get('batch_size', 1000)
        target_table = config.get('target_table')
        columns = config.get('columns')

        while True:
            # 1. Fetch batch
            data = await workflow.execute_activity(
                fetch_data_activity,
                args=[batch_size, self._last_id, columns],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=5),
            )
            if not data:
                break

            # 2. Transform data
            transformed_data = await workflow.execute_activity(
                transform_data_activity,
                args=[data],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=10),
            )

            # 3. Validate data
            is_valid = await workflow.execute_activity(
                validate_data_activity,
                args=[transformed_data],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=5),
            )
            if not is_valid:
                raise ValueError(
                    f"Data validation failed for batch after ID {self._last_id}")

            # 4. Load data
            success = await workflow.execute_activity(
                load_data_activity,
                args=[transformed_data, target_table],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=10),
            )
            if not success:
                raise RuntimeError(
                    f"Failed to load batch after ID {self._last_id}")

            # Update progress
            self._total_processed += len(data)
            self._last_id = data[-1]['id']

            # Record progress (activity not yet implemented)
            # await workflow.execute_activity(
            #     record_progress_activity,
            #     args=[{
            #         'last_id': self._last_id,
            #         'total_processed': self._total_processed
            #     }],
            #     retry_policy=retry_policy,
            #     start_to_close_timeout=timedelta(minutes=1)
            # )

        return {
            'total_processed': self._total_processed,
            'last_id': self._last_id
        }
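

# ---------------------------------------------------------------------------
# Example runner: a minimal sketch of how these activities and the workflow
# might be wired into a Temporal worker and executed. The server address,
# task queue name, and workflow ID below are illustrative assumptions, not
# values taken from the project configuration.
# ---------------------------------------------------------------------------
import asyncio

from temporalio.client import Client
from temporalio.worker import Worker


async def _run_example() -> None:
    # Assumes a Temporal server is reachable at the default local address.
    client = await Client.connect("localhost:7233")

    # Run a worker for the duration of a single workflow execution.
    async with Worker(
        client,
        task_queue="data-migration-task-queue",  # assumed queue name
        workflows=[DataMigrationWorkflow],
        activities=[
            fetch_data_activity,
            transform_data_activity,
            validate_data_activity,
            load_data_activity,
        ],
    ):
        result = await client.execute_workflow(
            DataMigrationWorkflow.run,
            id="data-migration-workflow",  # assumed workflow ID
            task_queue="data-migration-task-queue",
        )
        print(f"Migration finished: {result}")


if __name__ == "__main__":
    asyncio.run(_run_example())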