mirror of https://github.com/PostHog/posthog.git synced 2024-11-21 05:29:25 +01:00

chore(data-warehouse): Second attempt at rolling this out for some testing (#26162)

Tom Owers 2024-11-14 09:36:56 +00:00 committed by GitHub
parent ee8d0f94f1
commit aafaea151e
GPG Key ID: B5690EEEBB952194
4 changed files with 81 additions and 55 deletions


@@ -801,6 +801,8 @@ posthog/temporal/tests/batch_exports/test_batch_exports.py:0: error: TypedDict k
posthog/temporal/data_modeling/run_workflow.py:0: error: Dict entry 20 has incompatible type "str": "Literal['complex']"; expected "str": "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'binary', 'json', 'decimal', 'wei', 'date', 'time']" [dict-item]
posthog/temporal/data_modeling/run_workflow.py:0: error: Dict entry 21 has incompatible type "str": "Literal['complex']"; expected "str": "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'binary', 'json', 'decimal', 'wei', 'date', 'time']" [dict-item]
posthog/temporal/data_modeling/run_workflow.py:0: error: Dict entry 22 has incompatible type "str": "Literal['complex']"; expected "str": "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'binary', 'json', 'decimal', 'wei', 'date', 'time']" [dict-item]
posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: "FilesystemDestinationClientConfiguration" has no attribute "delta_jobs_per_write" [attr-defined]
posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: "type[FilesystemDestinationClientConfiguration]" has no attribute "delta_jobs_per_write" [attr-defined]
posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "DataWarehouseCredential | Combinable | None") [assignment]
posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "str | int | Combinable") [assignment]
posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "dict[str, dict[str, str | bool]] | dict[str, str]", variable has type "dict[str, dict[str, str]]") [assignment]
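The two delta_jobs_per_write entries are new in this commit: the attribute is assigned onto FilesystemDestinationClientConfiguration at runtime by the monkey patch further down rather than declared on the class, so mypy reports attr-defined for both the instance and the class access, and the errors land in this baseline instead of being suppressed inline. A minimal, self-contained illustration of that pattern (toy class, not dlt or PostHog code):

class ThirdPartyConfig:
    # The attribute is not declared on the class...
    declared_option: int = 10


# ...so this runtime assignment works, but mypy reports
#   "type[ThirdPartyConfig]" has no attribute "patched_option"  [attr-defined]
ThirdPartyConfig.patched_option = 1

config = ThirdPartyConfig()
print(getattr(config, "patched_option"))  # prints 1 at runtime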


@@ -1,11 +1,13 @@
from dataclasses import dataclass
from typing import Literal
from typing import Any, Literal, Optional
from collections.abc import Iterator, Sequence
import uuid
import dlt
from django.conf import settings
from django.db.models import Prefetch
from dlt.pipeline.exceptions import PipelineStepFailed
from deltalake import DeltaTable
from posthog.settings.base_variables import TEST
from structlog.typing import FilteringBoundLogger
@@ -14,6 +16,21 @@ from dlt.common.normalizers.naming.snake_case import NamingConvention
from dlt.common.schema.typing import TSchemaTables
from dlt.load.exceptions import LoadClientJobRetry
from dlt.sources import DltSource
from dlt.destinations.impl.filesystem.filesystem import FilesystemClient
from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration
from dlt.common.destination.reference import (
FollowupJobRequest,
)
from dlt.common.destination.typing import (
PreparedTableSchema,
)
from dlt.destinations.job_impl import (
ReferenceFollowupJobRequest,
)
from dlt.common.storages import FileStorage
from dlt.common.storages.load_package import (
LoadJobInfo,
)
from deltalake.exceptions import DeltaError
from collections import Counter
from clickhouse_driver.errors import ServerException
@@ -25,7 +42,7 @@ from posthog.warehouse.models.external_data_job import ExternalDataJob
from posthog.warehouse.models.external_data_schema import ExternalDataSchema
from posthog.warehouse.models.external_data_source import ExternalDataSource
from posthog.warehouse.models.table import DataWarehouseTable
from posthog.temporal.data_imports.util import prepare_s3_files_for_querying
from posthog.temporal.data_imports.util import is_posthog_team, prepare_s3_files_for_querying
@dataclass
@@ -100,51 +117,54 @@ class DataImportPipelineSync:
pipeline_name = self._get_pipeline_name()
destination = self._get_destination()
# def create_table_chain_completed_followup_jobs(
# self: FilesystemClient,
# table_chain: Sequence[PreparedTableSchema],
# completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None,
# ) -> list[FollowupJobRequest]:
# assert completed_table_chain_jobs is not None
# jobs = super(FilesystemClient, self).create_table_chain_completed_followup_jobs(
# table_chain, completed_table_chain_jobs
# )
# if table_chain[0].get("table_format") == "delta":
# for table in table_chain:
# table_job_paths = [
# job.file_path
# for job in completed_table_chain_jobs
# if job.job_file_info.table_name == table["name"]
# ]
# if len(table_job_paths) == 0:
# # file_name = ParsedLoadJobFileName(table["name"], "empty", 0, "reference").file_name()
# TODO: if we implement removal of orphaned rows, we may need to propagate such a job without files
# # to the delta load job
# pass
# else:
# files_per_job = self.config.delta_jobs_per_write or len(table_job_paths)
# for i in range(0, len(table_job_paths), files_per_job):
# jobs_chunk = table_job_paths[i : i + files_per_job]
# file_name = FileStorage.get_file_name_from_file_path(jobs_chunk[0])
# jobs.append(ReferenceFollowupJobRequest(file_name, jobs_chunk))
def create_table_chain_completed_followup_jobs(
self: FilesystemClient,
table_chain: Sequence[PreparedTableSchema],
completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None,
) -> list[FollowupJobRequest]:
assert completed_table_chain_jobs is not None
jobs = super(FilesystemClient, self).create_table_chain_completed_followup_jobs(
table_chain, completed_table_chain_jobs
)
if table_chain[0].get("table_format") == "delta":
for table in table_chain:
table_job_paths = [
job.file_path
for job in completed_table_chain_jobs
if job.job_file_info.table_name == table["name"]
]
if len(table_job_paths) == 0:
# file_name = ParsedLoadJobFileName(table["name"], "empty", 0, "reference").file_name()
# TODO: if we implement removal of orphaned rows, we may need to propagate such a job without files
# to the delta load job
pass
else:
files_per_job = self.config.delta_jobs_per_write or len(table_job_paths)
for i in range(0, len(table_job_paths), files_per_job):
jobs_chunk = table_job_paths[i : i + files_per_job]
file_name = FileStorage.get_file_name_from_file_path(jobs_chunk[0])
jobs.append(ReferenceFollowupJobRequest(file_name, jobs_chunk))
# return jobs
return jobs
# def _iter_chunks(self, lst: list[Any], n: int) -> Iterator[list[Any]]:
# """Yield successive n-sized chunks from lst."""
# for i in range(0, len(lst), n):
# yield lst[i : i + n]
def _iter_chunks(self, lst: list[Any], n: int) -> Iterator[list[Any]]:
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i : i + n]
# Monkey patch to fix large memory consumption until https://github.com/dlt-hub/dlt/pull/2031 gets merged in
# if self._incremental or is_posthog_team(self.inputs.team_id):
# FilesystemDestinationClientConfiguration.delta_jobs_per_write = 1
# FilesystemClient.create_table_chain_completed_followup_jobs = create_table_chain_completed_followup_jobs # type: ignore
# FilesystemClient._iter_chunks = _iter_chunks # type: ignore
if (
is_posthog_team(self.inputs.team_id)
and str(self.inputs.source_id) == "01932521-63e7-0000-087c-527af4a2bc4d" # Tom's test source in prod
):
FilesystemDestinationClientConfiguration.delta_jobs_per_write = 1
FilesystemClient.create_table_chain_completed_followup_jobs = create_table_chain_completed_followup_jobs # type: ignore
FilesystemClient._iter_chunks = _iter_chunks # type: ignore
# dlt.config["data_writer.file_max_items"] = 500_000
# dlt.config["data_writer.file_max_bytes"] = 500_000_000 # 500 MB
# dlt.config["loader_parallelism_strategy"] = "table-sequential"
# dlt.config["delta_jobs_per_write"] = 1
dlt.config["data_writer.file_max_items"] = 500_000
dlt.config["data_writer.file_max_bytes"] = 500_000_000 # 500 MB
dlt.config["loader_parallelism_strategy"] = "table-sequential"
dlt.config["delta_jobs_per_write"] = 1
dlt.config["normalize.parquet_normalizer.add_dlt_load_id"] = True
dlt.config["normalize.parquet_normalizer.add_dlt_id"] = True
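The gate above enables the patch only for a single internal test source. With delta_jobs_per_write set to 1, the followup-job loop emits one ReferenceFollowupJobRequest per completed load file instead of a single request covering every file, which appears to be the memory fix the monkey-patch comment refers to. A stand-alone sketch of that chunking arithmetic (plain Python, no dlt, illustrative only):

from typing import Optional


def chunk_followup_jobs(file_paths: list[str], delta_jobs_per_write: Optional[int]) -> list[list[str]]:
    # Mirrors the loop above: group completed load file paths into followup jobs.
    if not file_paths:
        return []
    files_per_job = delta_jobs_per_write or len(file_paths)
    return [file_paths[i : i + files_per_job] for i in range(0, len(file_paths), files_per_job)]


paths = ["a.parquet", "b.parquet", "c.parquet"]
print(chunk_followup_jobs(paths, None))  # [['a.parquet', 'b.parquet', 'c.parquet']] - one big job (unpatched)
print(chunk_followup_jobs(paths, 1))     # [['a.parquet'], ['b.parquet'], ['c.parquet']] - one job per file (patched)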
@@ -173,21 +193,22 @@ class DataImportPipelineSync:
pipeline = self._create_pipeline()
# Workaround for full refresh schemas while we wait for Rust to fix memory issue
# if is_posthog_team(self.inputs.team_id):
# for name, resource in self.source._resources.items():
# if resource.write_disposition == "replace":
# try:
# delta_uri = f"{settings.BUCKET_URL}/{self.inputs.dataset_name}/{name}"
# delta_table = DeltaTable(delta_uri, storage_options=self._get_credentials())
# except TableNotFoundError:
# delta_table = None
if (
is_posthog_team(self.inputs.team_id)
and str(self.inputs.source_id) == "01932521-63e7-0000-087c-527af4a2bc4d" # Tom's test source in prod
):
for name, resource in self.source._resources.items():
if resource.write_disposition == "replace":
delta_uri = f"{settings.BUCKET_URL}/{self.inputs.dataset_name}/{name}"
storage_options = self._get_credentials()
# if delta_table:
# self.logger.debug("Deleting existing delta table")
# delta_table.delete()
if DeltaTable.is_deltatable(delta_uri, storage_options):
delta_table = DeltaTable(delta_uri, storage_options=self._get_credentials())
self.logger.debug("Deleting existing delta table")
delta_table.delete()
# self.logger.debug("Updating table write_disposition to append")
# resource.apply_hints(write_disposition="append")
self.logger.debug("Updating table write_disposition to append")
resource.apply_hints(write_disposition="append")
total_counts: Counter[str] = Counter({})
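For the gated source, the hunk above replaces a dlt "replace" write by deleting any existing Delta table up front and then appending, sidestepping the memory issue mentioned in the workaround comment. A hedged, local-filesystem sketch of the deltalake calls it relies on (the temporary path and sample frames are illustrative, not the S3 layout used in production):

import pandas as pd
from deltalake import DeltaTable, write_deltalake

table_uri = "/tmp/example_delta_table"  # illustrative local path
write_deltalake(table_uri, pd.DataFrame({"id": [1, 2, 3]}), mode="overwrite")  # seed an existing table

if DeltaTable.is_deltatable(table_uri):
    delta_table = DeltaTable(table_uri)
    delta_table.delete()  # without a predicate this removes all rows
    # The fresh sync can then be written with write_disposition="append".
    write_deltalake(table_uri, pd.DataFrame({"id": [4, 5]}), mode="append")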


@@ -77,6 +77,7 @@ class TestDataImportPipeline(APIBaseTest):
) as mock_validate_schema_and_update_table,
patch("posthog.temporal.data_imports.pipelines.pipeline_sync.get_delta_tables"),
patch("posthog.temporal.data_imports.pipelines.pipeline_sync.update_last_synced_at_sync"),
patch("posthog.temporal.data_imports.pipelines.pipeline_sync.is_posthog_team", return_value=False),
):
pipeline = self._create_pipeline("Customer", False)
res = pipeline.run()
@@ -98,6 +99,7 @@ class TestDataImportPipeline(APIBaseTest):
) as mock_validate_schema_and_update_table,
patch("posthog.temporal.data_imports.pipelines.pipeline_sync.get_delta_tables"),
patch("posthog.temporal.data_imports.pipelines.pipeline_sync.update_last_synced_at_sync"),
patch("posthog.temporal.data_imports.pipelines.pipeline_sync.is_posthog_team", return_value=False),
):
pipeline = self._create_pipeline("Customer", True)
res = pipeline.run()
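Both tests now patch is_posthog_team to return False so the newly gated code paths stay disabled under test. A self-contained sketch of how such a patch keeps a gate off (hypothetical gate logic, not the real helper in posthog.temporal.data_imports.util):

from unittest.mock import patch


def is_posthog_team(team_id: int) -> bool:
    # Hypothetical gate logic for illustration only; the real helper may differ.
    return team_id in {1, 2}


def choose_write_path(team_id: int) -> str:
    return "patched delta write path" if is_posthog_team(team_id) else "default path"


with patch("__main__.is_posthog_team", return_value=False):
    print(choose_write_path(team_id=2))  # "default path" while the gate is patched off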


@@ -202,6 +202,7 @@ async def _execute_run(workflow_id: str, inputs: ExternalDataWorkflowInputs, moc
),
mock.patch.object(AwsCredentials, "to_session_credentials", mock_to_session_credentials),
mock.patch.object(AwsCredentials, "to_object_store_rs_credentials", mock_to_object_store_rs_credentials),
mock.patch("posthog.temporal.data_imports.pipelines.pipeline_sync.is_posthog_team", return_value=False),
):
async with await WorkflowEnvironment.start_time_skipping() as activity_environment:
async with Worker(