from clearml.automation.controller import PipelineDecorator
from clearml import TaskTypes
@PipelineDecorator.component(return_values=["data_frame"], cache=True, task_type=TaskTypes.data_processing)
def step_one(pickle_data_url: str, extra: int = 43):
    """Fetch a pickled dataset and return it as a DataFrame with a ``target`` column.

    Args:
        pickle_data_url: URL of the pickle file. It is resolved to a local
            file path through ``StorageManager.get_local_copy``.
        extra: Not used by the body of this step.

    Returns:
        A ``pandas.DataFrame`` built from ``iris["data"]`` with
        ``iris["feature_names"]`` as column labels, plus one additional
        ``target`` column filled from ``iris["target"]``.
    """
    print("step_one")
    # NOTE(review): imports are kept inside the component; presumably the step
    # may be executed as a standalone task -- confirm against ClearML docs.
    # sklearn is not referenced directly; presumably it is required so the
    # pickled object can be deserialized -- confirm.
    import sklearn  # noqa
    import pickle
    import pandas as pd
    from clearml import StorageManager

    local_iris_pkl = StorageManager.get_local_copy(remote_url=pickle_data_url)
    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only point this step at URLs whose content is trusted.
    with open(local_iris_pkl, "rb") as f:
        iris = pickle.load(f)

    data_frame = pd.DataFrame(iris["data"], columns=iris["feature_names"])
    # Assigning to a new key is what creates the column. The previous
    # `data_frame.columns += ["target"]` did not add a column; it operated on
    # the existing labels and could rename every feature column.
    data_frame["target"] = iris["target"]
    return data_frame
@PipelineDecorator.component(
    return_values=["X_train", "X_test", "y_train", "y_test"], cache=True, task_type=TaskTypes.data_processing
)
def step_two(data_frame, test_size=0.2, random_state=42):
    """Split a DataFrame into train/test features and labels.

    Args:
        data_frame: DataFrame containing a ``target`` column; every other
            column is treated as a feature.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` produced by
        ``sklearn.model_selection.train_test_split``.
    """
    print("step_two")
    import pandas as pd  # noqa
    from sklearn.model_selection import train_test_split

    y = data_frame["target"]
    # Use a concrete list of labels: a generator is a hashable, one-shot
    # object, and selecting columns with it relies on pandas special-casing
    # iterators instead of treating the key as a single label.
    feature_columns = [c for c in data_frame.columns if c != "target"]
    X = data_frame[feature_columns]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test
@PipelineDecorator.component(return_values=["model"], cache=True, task_type=TaskTypes.training)
def step_three(X_train, y_train):
    """Fit a logistic-regression classifier on the training split.

    Args:
        X_train: Training features, passed to ``LogisticRegression.fit``.
        y_train: Training labels, passed to ``LogisticRegression.fit``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    print("step_three")
    import pandas as pd  # noqa
    from sklearn.linear_model import LogisticRegression

    # `multi_class` is intentionally not passed: "auto" is the default, and
    # the parameter is deprecated in recent scikit-learn releases, so passing
    # it explicitly only adds a warning (and will fail once it is removed).
    model = LogisticRegression(solver="liblinear")
    model.fit(X_train, y_train)
    return model
@PipelineDecorator.component(return_values=["accuracy"], cache=True, task_type=TaskTypes.qc)
def step_four(model, X_data, Y_data):
    """Score a fitted model against labelled data.

    Args:
        model: Object exposing ``predict``; called with ``X_data``.
        X_data: Features to predict on.
        Y_data: Reference labels compared against the predictions.

    Returns:
        The value of ``accuracy_score(Y_data, predictions, normalize=True)``.
    """
    from sklearn.linear_model import LogisticRegression  # noqa
    from sklearn.metrics import accuracy_score

    predictions = model.predict(X_data)
    score = accuracy_score(Y_data, predictions, normalize=True)
    return score
@PipelineDecorator.pipeline(name="custom pipeline logic", project="examples", version="0.0.5")
def executing_pipeline(pickle_url, mock_parameter="mock"):
    """Chain the four pipeline steps and print the resulting accuracy.

    Args:
        pickle_url: Passed to ``step_one`` as the dataset location.
        mock_parameter: Only echoed in the first log line; not used otherwise.
    """
    print("pipeline args:", pickle_url, mock_parameter)

    # Step 1: load the dataset.
    print("launch step one")
    dataset = step_one(pickle_url)

    # Step 2: split into train / test partitions.
    print("launch step two")
    train_features, test_features, train_labels, test_labels = step_two(dataset)

    # Step 3: fit the classifier on the training partition.
    print("launch step three")
    fitted_model = step_three(train_features, train_labels)
    print("returned model: {}".format(fitted_model))

    # Step 4: evaluate on the held-out partition and report as a percentage.
    print("launch step four")
    score = step_four(fitted_model, X_data=test_features, Y_data=test_labels)
    accuracy = 100 * score
    print(f"Accuracy={accuracy}%")
if __name__ == "__main__":
    # NOTE(review): run_locally() is called before the pipeline function so
    # that, per its name, execution happens in the local process -- see the
    # ClearML PipelineDecorator documentation for the exact semantics.
    PipelineDecorator.run_locally()

    # Location of the pickled dataset handed to the first step.
    iris_pickle_url = "https://github.com/allegroai/events/raw/master/odsc20-east/generic/iris_dataset.pkl"
    executing_pipeline(pickle_url=iris_pickle_url)

    print("process completed")