Integrate state-of-the-art pre-trained models from Hugging Face into your ZenML pipelines for powerful NLP and computer vision capabilities. Leverage the extensive Hugging Face Model Hub directly within your ML workflows, enabling efficient transfer learning and rapid prototyping.
from typing import Tuple
from zenml import pipeline, step
from zenml.integrations.huggingface.steps import run_with_accelerate
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DistilBertForSequenceClassification,
)
from datasets import load_dataset, Dataset


@step
def prepare_data() -> Tuple[Dataset, Dataset]:  # Return any Huggingface dataset
    dataset = load_dataset("imdb")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    return (
        tokenized_datasets["train"].shuffle(seed=42).select(range(1000)),
        tokenized_datasets["test"].shuffle(seed=42).select(range(100)),
    )


@run_with_accelerate(num_processes=4, multi_gpu=True)  # Distribute workload with accelerate
@step(enable_cache=False)
def train_model(
    train_dataset: Dataset, eval_dataset: Dataset
) -> DistilBertForSequenceClassification:
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2
    )
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    trainer.train()
    return model  # Return any HF model to track it


@pipeline
def fine_tuning_pipeline():
    train_dataset, eval_dataset = prepare_data()
    model = train_model(train_dataset, eval_dataset)


if __name__ == "__main__":
    fine_tuning_pipeline()
This example demonstrates how to fine-tune a Hugging Face model within a ZenML pipeline for sentiment analysis on the IMDB dataset. The pipeline consists of two main steps:
- Data Preparation: The prepare_data step loads the IMDB dataset, tokenizes it with a DistilBERT tokenizer, and returns small subsets of the train and test splits.
- Model Training: The train_model step initializes a DistilBERT classification model, sets up the training arguments, fine-tunes the model on the prepared data, and returns it so ZenML tracks it as an artifact. The evaluation dataset is passed to the Trainer, and checkpoints are written to output_dir (see the sketch after this list for enabling per-epoch evaluation). Because the step is decorated with run_with_accelerate, training is distributed across 4 GPUs via the Accelerate launcher.
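The TrainingArguments in the step above keep the transformers defaults, so the Trainer never runs an evaluation pass on its own. As a minimal sketch (not part of the original example), per-epoch evaluation and best-checkpoint selection could be enabled as shown below; on older transformers releases the first keyword is spelled evaluation_strategy, so treat the exact name as version-dependent.

from transformers import TrainingArguments

# Sketch: extend the training arguments from train_model so the Trainer
# evaluates on eval_dataset at the end of every epoch and reloads the best
# checkpoint when training finishes. Keyword names follow recent transformers
# releases (older versions use evaluation_strategy instead of eval_strategy).
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)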
The fine_tuning_pipeline function combines these steps into a cohesive workflow. This structure allows for easy modification and extension of the pipeline for different NLP tasks or models.
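Once the pipeline has run, the model returned by train_model is versioned like any other step output. The following is a minimal sketch, assuming ZenML's standard Client API and the pipeline and step names from the example above, of fetching the fine-tuned model afterwards:

from zenml.client import Client

# Sketch: fetch the most recent run of the pipeline defined above and load the
# Hugging Face model artifact produced by the train_model step.
run = Client().get_pipeline("fine_tuning_pipeline").last_run
trained_model = run.steps["train_model"].output.load()
print(type(trained_model))  # e.g. DistilBertForSequenceClassification

Because the step returns a Hugging Face model, the Hugging Face integration's materializer handles (de)serialization, so the loaded object can be used directly for inference or further fine-tuning.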