brave-yak-3559
08/07/2023, 12:16 AMbotocore.*exceptions*.ClientError: An *error* occurred (ValidationException) when calling the PutItem operation: Item size has exceeded the maximum allowed size
. the sample data i am working has 1.5 mil rows but for the real data the number of rows can be around 10 million. below is my code. any suggestion is appreciated. Thanks.
@project(name="sample")
class SampleFlow(FlowSpec):
@batch(image=DEFAULT_IMAGE, **{'cpu': 4, 'memory': 20480})
@step
def start(self):
import pandas as pd
with S3() as s3:
s3obj = s3.get('<s3://data.csv>')
df = pd.read_csv(s3obj.path)
self.rows=df.to_dict('records')
self.next(self.ml_model,foreach='rows')
@batch(image=DEFAULT_IMAGE, **{'cpu': 6, 'memory': 15000})
@step
def ml_model(self):
import MLModel
row = self.input
result = MLModel(row)
result = {**row, **result}
self.result = result
print(result)
self.next(self.join_results)
@batch(image=DEFAULT_IMAGE, **{'cpu': 6, 'memory': 180000})
@step
def join_results(self,inputs):
import pandas as pd
dict_list = [inp.result for inp in inputs]
self.processed_df = pd.DataFrame(dict_list)
self.next(self.end)