# valohai.yaml
---
- step:
    name: ingest_data
    image: python:3.9
    command:
      - python ingest_data.py
    outputs:
      - name: dataset
        path: dataset.csv

- step:
    name: train_model
    image: python:3.9
    command:
      - python train_model.py
    inputs:
      - name: dataset
        default: datum://ingest_data/dataset.csv
    outputs:
      - name: model
        path: model.pkl

- step:
    name: evaluate_model
    image: python:3.9
    command:
      - python evaluate_model.py
    inputs:
      - name: dataset
        default: datum://ingest_data/dataset.csv
      - name: model
        default: datum://train_model/model.pkl
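
# The three steps above can also be wired into a single Valohai pipeline so
# that each node's outputs automatically feed the next node's inputs. A
# minimal sketch: the pipeline and node names are illustrative, and the edge
# sources reference the files each node writes (exact patterns may need
# adjusting for your project).
- pipeline:
    name: train-and-evaluate
    nodes:
      - name: ingest
        type: execution
        step: ingest_data
      - name: train
        type: execution
        step: train_model
      - name: evaluate
        type: execution
        step: evaluate_model
    edges:
      - [ingest.output.dataset.csv, train.input.dataset]
      - [ingest.output.dataset.csv, evaluate.input.dataset]
      - [train.output.model.pkl, evaluate.input.model]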
# ingest_data.py
import pandas as pd

def ingest_data():
    # Read the raw dataset shipped with the project and publish it as a
    # Valohai output so that downstream steps can consume it.
    df = pd.read_csv("data/dataset.csv")
    df.to_csv("/valohai/outputs/dataset.csv", index=False)

if __name__ == "__main__":
    ingest_data()
# train_model.py
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestRegressor

def train_model():
    # Valohai mounts each declared input under /valohai/inputs/<input-name>/,
    # so the "dataset" input resolves to /valohai/inputs/dataset/dataset.csv.
    df = pd.read_csv("/valohai/inputs/dataset/dataset.csv")
    X, y = df.drop("target", axis=1), df["target"]
    model = RandomForestRegressor(n_estimators=100)
    model.fit(X, y)
    with open("/valohai/outputs/model.pkl", "wb") as f:
        pickle.dump(model, f)

if __name__ == "__main__":
    train_model()
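
# As an alternative to hard-coded /valohai/... paths, the optional
# valohai-utils helper library (pip install valohai-utils) can resolve input
# and output paths by name. A minimal sketch assuming the library is installed
# in the image; it is not used by the original scripts.
import pickle

import pandas as pd
import valohai
from sklearn.ensemble import RandomForestRegressor

def train_model_with_utils():
    # valohai.inputs("dataset").path() points at the downloaded input file;
    # valohai.outputs().path("model.pkl") gives a path under /valohai/outputs.
    df = pd.read_csv(valohai.inputs("dataset").path())
    X, y = df.drop("target", axis=1), df["target"]
    model = RandomForestRegressor(n_estimators=100).fit(X, y)
    with open(valohai.outputs().path("model.pkl"), "wb") as f:
        pickle.dump(model, f)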
# evaluate_model.py
import pickle

import pandas as pd
from sklearn.metrics import mean_squared_error

def evaluate_model():
    # Inputs are mounted under /valohai/inputs/<input-name>/.
    df = pd.read_csv("/valohai/inputs/dataset/dataset.csv")
    with open("/valohai/inputs/model/model.pkl", "rb") as f:
        model = pickle.load(f)
    X, y = df.drop("target", axis=1), df["target"]
    rmse = mean_squared_error(y, model.predict(X)) ** 0.5
    print(f"RMSE: {rmse}")

if __name__ == "__main__":
    evaluate_model()
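
# Valohai collects single-line JSON printed to stdout as execution metadata,
# which makes metrics such as RMSE visible and comparable in the UI. A minimal
# sketch of how evaluate_model() could report it; the metric key "rmse" and
# the helper name are illustrative, not part of the original script.
import json

def report_metrics(rmse: float) -> None:
    # Each JSON line printed to stdout becomes a metadata entry for this step.
    print(json.dumps({"rmse": rmse}))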
# To run each step individually as an ad-hoc execution (packages the local
# working copy and runs it on Valohai):
# vh execution run --adhoc ingest_data
# vh execution run --adhoc train_model
# vh execution run --adhoc evaluate_model
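#
# If the pipeline block sketched above is added to valohai.yaml, the whole
# chain can be launched in one go instead. The pipeline name matches that
# sketch; whether --adhoc is accepted for pipelines depends on the CLI
# version, so treat this command as an assumption:
# vh pipeline run train-and-evaluate --adhoc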