"""Temporal workflow and activities for a batched data migration pipeline:
fetch -> transform -> validate -> load, paginated by the source row id."""
from datetime import timedelta
|
|
import pandas as pd
|
|
from connectors.database import DatabaseConnector
|
|
from temporalio import workflow, activity
|
|
from temporalio.common import RetryPolicy
|
|
from typing import Dict, List
|
|
from config import source_config, target_config, config
|
|
from transformation.wo_transform import WoDataTransformer
|
|
# Activities
|
|
|
|
|
|
@activity.defn
async def fetch_data_activity(batch_size: int, last_id: int, columns: List[str]) -> List[Dict]:
    """Fetch one batch of source rows with id greater than *last_id*.

    Args:
        batch_size: Maximum number of rows to fetch in this batch.
        last_id: Exclusive lower bound on the row id (keyset pagination).
        columns: Column names to select from the source table.

    Returns:
        The batch as a list of per-row dicts, so the payload is serializable
        across the Temporal activity boundary.
        (Fixed: the original annotated the return as ``Dict``, but
        ``to_dict(orient='records')`` returns a list of dicts.)
    """
    # NOTE(review): DatabaseConnector looks synchronous; a blocking fetch
    # inside an async activity stalls the worker's event loop. Consider a
    # sync activity or asyncio.to_thread — confirm the connector's behavior.
    db_connector = DatabaseConnector(source_config, target_config)
    df = db_connector.fetch_batch(batch_size, last_id, columns)
    return df.to_dict(orient='records')
@activity.defn
async def transform_data_activity(data: List[Dict]) -> List[Dict]:
    """Run the WO transformation over a batch of records.

    Converts the record dicts to a DataFrame, applies
    ``WoDataTransformer.transform``, and hands the result back as records so
    it stays serializable for Temporal.
    """
    frame = pd.DataFrame(data)
    transformed = WoDataTransformer().transform(frame)
    return transformed.to_dict(orient='records')
@activity.defn
async def validate_data_activity(data: List[Dict]) -> bool:
    """Validate a batch of transformed records.

    Rebuilds a DataFrame from the records and returns whatever
    ``WoDataTransformer.validate`` reports for it.
    """
    return WoDataTransformer().validate(pd.DataFrame(data))
@activity.defn
async def load_data_activity(data: List[Dict], target_table: str) -> bool:
    """Write a batch of records into *target_table* on the target database.

    Returns the connector's success flag for the batch load.
    """
    connector = DatabaseConnector(source_config, target_config)
    frame = pd.DataFrame(data)
    return connector.load_batch(frame, target_table)
# Workflow
|
|
|
|
|
|
@workflow.defn
class DataMigrationWorkflow:
    """Batched ETL workflow: fetch -> transform -> validate -> load until the
    source is exhausted, paginating on the row id of the last processed row.
    """

    def __init__(self) -> None:
        # Running count of migrated rows.
        self._total_processed = 0
        # Id of the last processed row; batches fetch rows with id > this.
        self._last_id = 0

    @workflow.run
    async def run(self) -> Dict:
        """Drive the migration loop and return final progress counters.

        Returns:
            Dict with ``total_processed`` and ``last_id``.

        Raises:
            ValueError: if a transformed batch fails validation.
            Exception: if the load step reports failure.
        """
        retry_policy = RetryPolicy(
            initial_interval=timedelta(seconds=1),
            maximum_interval=timedelta(minutes=10),
            maximum_attempts=3,
        )

        batch_size = config.get('batch_size', 1000)
        target_table = config.get('target_table')
        columns = config.get('columns')

        # NOTE(review): an unbounded loop accumulates Temporal workflow
        # history; for very large migrations consider
        # workflow.continue_as_new after N batches — confirm expected volume.
        while True:
            # 1. Fetch the next batch after the last processed id.
            data = await workflow.execute_activity(
                fetch_data_activity,
                args=[batch_size, self._last_id, columns],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=5)
            )

            if not data:
                break  # source exhausted

            # 2. Transform data.
            # BUGFIX: transform_data_activity takes a single 'data' argument;
            # the original passed args=[data, config], an arity mismatch that
            # fails when the activity is invoked.
            transformed_data = await workflow.execute_activity(
                transform_data_activity,
                args=[data],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=10)
            )

            # 3. Validate data before loading.
            is_valid = await workflow.execute_activity(
                validate_data_activity,
                args=[transformed_data],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=5)
            )

            if not is_valid:
                raise ValueError(
                    f"Data validation failed for batch after ID {self._last_id}")

            # 4. Load the transformed batch into the target table.
            success = await workflow.execute_activity(
                load_data_activity,
                args=[transformed_data, target_table],
                retry_policy=retry_policy,
                start_to_close_timeout=timedelta(minutes=10)
            )

            if not success:
                raise Exception(
                    f"Failed to load batch after ID {self._last_id}")

            # Update progress. Assumes fetch returns rows ordered by id and
            # that every record carries an 'id' key — TODO confirm against
            # DatabaseConnector.fetch_batch.
            self._total_processed += len(data)
            self._last_id = data[-1]['id']

            # TODO(review): persist progress (last_id / total_processed) via a
            # record-progress activity so restarts can resume mid-migration.

        return {
            'total_processed': self._total_processed,
            'last_id': self._last_id
        }