--How to run pandas-related checks using gokart --The first check verifies that a task finishes normally when the input pd.DataFrame is empty. --The second check verifies that each column has the expected type when the DataFrame is dumped.
--gokart is OSS developed by M3 and fringe81. --It wraps luigi, developed by Spotify, to make it easier to use; in particular, it reduces the amount of code you have to write.
--The code below raises an error when the pd.DataFrame is empty. --Writing unit tests is a prerequisite, but there were many cases that the tests failed to catch.
class DataTask(gokart.TaskOnKart):
    """Produce the sample DataFrame consumed by downstream tasks."""

    task_namespace = 'sample'

    def run(self):
        """Build a two-row DataFrame with `user` and `item` columns and dump it."""
        df = pd.DataFrame(dict(user=[1, 2], item=['a', 'b']))
        self.dump(df)
class TaskA(gokart.TaskOnKart):
    """Prefix every `user` value of DataTask's output with ``user_``."""

    task_namespace = 'sample'

    def requires(self):
        """Declare DataTask as the upstream dependency."""
        return DataTask()

    def run(self):
        """Load the upstream DataFrame, rewrite `user`, and dump the result."""
        df = self.load_data_frame()
        # Deliberately left unguarded for this example: when df is empty,
        # the column lookup below raises `KeyError`.
        df['user'] = df['user'].apply(lambda x: f'user_{x}')
        self.dump(df)
# Hand control to gokart only when this file is executed as a script.
if __name__ == '__main__':
    gokart.run()
--You can check whether the tasks still work when the DataFrame is empty by executing the following.
--Add `--test-run-pandas` and `--test-run-namespace=sample`.
$ python main.py sample.TaskA --local-scheduler --test-run-pandas --test-run-namespace=sample
--Message
gokart test results:
status=OK; namespace=sample; name=DataTask; id=10f87ddcf3df71d786a023ae5e0bbc98;
status=NG; namespace=sample; name=TaskA; id=44e9690a4d2182a9bed6b6d9730291bd; message=<class 'KeyError'>: user
--Check if the pandas type has changed due to an unintended operation.
--You could check this in each task, but here you define the combinations of column_name and type once per namespace.
--Inherit from `gokart.PandasTypeConfig` and define the rules for the namespace.
class SamplePandasTypeCheck(gokart.PandasTypeConfig):
    """Expected column types for DataFrames in the ``sample`` namespace."""

    task_namespace = 'sample'

    @classmethod
    def type_dict(cls) -> Dict[str, Any]:
        """Return the mapping of column name to its expected type."""
        return {'user': int}
class DataTask(gokart.TaskOnKart):
    """Produce the sample DataFrame consumed by downstream tasks."""

    task_namespace = 'sample'

    def run(self):
        """Build a two-row DataFrame with `user` and `item` columns and dump it."""
        df = pd.DataFrame(dict(user=[1, 2], item=['a', 'b']))
        self.dump(df)
class TaskA(gokart.TaskOnKart):
    """Prefix every `user` value of DataTask's output with ``user_``."""

    task_namespace = 'sample'

    def requires(self):
        """Declare DataTask as the upstream dependency."""
        return DataTask()

    def run(self):
        """Load the upstream DataFrame, rewrite `user`, and dump the result."""
        df = self.load_data_frame()
        # Deliberate for this example: formatting each value into an f-string
        # changes the `user` column from int to str.
        df['user'] = df['user'].apply(lambda x: f'user_{x}')
        self.dump(df)
# Hand control to gokart only when this file is executed as a script.
if __name__ == '__main__':
    gokart.run()
--If you execute the task normally, the types are checked automatically.
--In this case, `TaskA` fails because `user` has been changed to `str`.
$ python main.py sample.TaskA --local-scheduler
--Message
gokart.pandas_type_config.PandasTypeError: expected type is "<class 'int'>", but "<class 'str'>" is passed in column "user".
...
===== Luigi Execution Summary =====
Scheduled 2 tasks of which:
* 1 ran successfully:
- 1 sample.DataTask(...)
* 1 failed:
- 1 sample.TaskA(...)
This progress looks :( because there were failed tasks
===== Luigi Execution Summary =====
Recommended Posts