gray-activity-77385
07/07/2023, 4:21 PM

kmeans_flow_v1.py:
from metaflow import FlowSpec, step, Parameter

class KmeansFlow(FlowSpec):

    num_docs = Parameter('num-docs', help='Number of documents', default=1000)

    @step
    def start(self):
        import scale_data
        scale_data.load_yelp_reviews(self.num_docs)
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == '__main__':
    KmeansFlow()
It uses the load_yelp_reviews function from the following Python module, scale_data.py:
import tarfile
from itertools import islice
from metaflow import S3

def load_yelp_reviews(num_docs):
    with S3() as s3:
        res = s3.get('s3://fast-ai-nlp/yelp_review_full_csv.tgz')
        with tarfile.open(res.path) as tar:
            datafile = tar.extractfile('yelp_review_full_csv/train.csv')
            return list(islice(datafile, num_docs))

def make_matrix(docs, binary=False):
    from sklearn.feature_extraction.text import CountVectorizer
    vec = CountVectorizer(min_df=10, max_df=0.1, binary=binary)
    mtx = vec.fit_transform(docs)
    cols = [None] * len(vec.vocabulary_)
    for word, idx in vec.vocabulary_.items():
        cols[idx] = word
    return mtx, cols
On executing python3 kmeans_flow_v1.py --with batch, the tasks start up fine but then exit with an error saying that S3 access has been denied. I would appreciate any help or suggestions for resolving this. I'm not sure whether the problem lies with the public fast-ai S3 bucket or with the deployed AWS Batch compute cluster.
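One way to narrow this down (a rough sketch, not tested against your deployment; the flow name and step below are hypothetical): since fast-ai-nlp is a public bucket, you can try fetching the same object from inside a Batch task twice, once with unsigned (anonymous) boto3 requests and once with the task's default credentials. If the anonymous call succeeds but the signed one is denied, the bucket is reachable and the suspect is more likely the IAM role attached to the Batch job; if both fail, it points toward networking or VPC/S3 endpoint configuration in the Batch compute environment.

from metaflow import FlowSpec, step

class S3AccessCheckFlow(FlowSpec):
    # Hypothetical diagnostic flow: run it with '--with batch' so the check
    # executes in the same compute environment as the k-means flow.

    @step
    def start(self):
        import boto3
        from botocore import UNSIGNED
        from botocore.config import Config
        from botocore.exceptions import ClientError

        bucket, key = 'fast-ai-nlp', 'yelp_review_full_csv.tgz'

        # 1) Anonymous request: should succeed for a public bucket as long as
        #    the Batch task has outbound network access to S3.
        anon = boto3.client('s3', config=Config(signature_version=UNSIGNED))
        try:
            anon.head_object(Bucket=bucket, Key=key)
            print('anonymous access: OK')
        except ClientError as err:
            print('anonymous access failed:', err)

        # 2) Signed request using the task's default credentials (the Batch
        #    job role): if only this one is denied, look at that role's IAM
        #    policy or the bucket policy rather than the cluster itself.
        signed = boto3.client('s3')
        try:
            signed.head_object(Bucket=bucket, Key=key)
            print('signed access: OK')
        except ClientError as err:
            print('signed access failed:', err)

        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == '__main__':
    S3AccessCheckFlow()

Running this the same way as the original flow, e.g. python3 s3_check.py run --with batch, would exercise the same Batch queue and job role, so the printed results should tell you which of the two suspects to chase.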