
...

Below are example applications of a synthetic benchmark that tests performance with the ResNet50 model:

...

Single GPU

Code Block
languagepy
titlesinglegpu.py
linenumberstrue
collapsetrue
# source
# - https://github.com/horovod/horovod/blob/master/examples/pytorch/pytorch_synthetic_benchmark.py

import argparse
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data.distributed
from torchvision import models

import sys
import time
import numpy as np

# Benchmark settings
parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-i",
                    "--images",
                    type=int,
                    help="image number",
                    default=1024)
parser.add_argument('--batch_size',
                    type=int,
                    default=32,
                    help='input batch size')
parser.add_argument("-e",
                    "--epochs",
                    type=int,
                    help="epochs",
                    default=101)
parser.add_argument('--model',
                    type=str,
                    default='resnet50',
                    help='model to benchmark')
args = parser.parse_args()

# model
model = getattr(models, args.model)()
model.cuda()

lr_scaler = 1
optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler)

cudnn.benchmark = True
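# benchmark mode auto-tunes cuDNN convolution algorithms; safe here because the input size never changes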

# data
data = torch.randn(args.batch_size, 3, 224, 224)
target = torch.LongTensor(args.batch_size).random_() % 1000
data, target = data.cuda(), target.cuda()

# fit
def benchmark_step():
    optimizer.zero_grad()
    output = model(data)
    loss = F.cross_entropy(output, target)
    loss.backward()
    optimizer.step()
    return loss.item()

for epoch in range(args.epochs):
    begin = time.time()
    for batches in range(args.images//args.batch_size):
        loss = benchmark_step()
        if (batches%10 == 0):
            print('--- Epoch %2i, Batch %3i: Loss = %0.2f ---' % (epoch,
                                                                  batches,
                                                                  loss,))
    end = time.time()
    imgsec = args.images/(end-begin)
    print('--- Epoch %2i finished: %0.2f img/sec ---' % (epoch, imgsec))

...

Code Block
languagebash
titlesinglegpu.sh
linenumberstrue
collapsetrue
#!/bin/bash

#PBS -q gpu
#PBS -l select=1:ncpus=32:ngpus=1
#PBS -o output/
#PBS -e output/

# load the module
module load scientific/pytorch/1.14.0-ngc

# move to the directory containing the job script
cd ${PBS_O_WORKDIR:-""}

# run the script using run-singlegpu.sh
run-singlegpu.sh singlegpu.py \
  --images 25600 \
  --batch_size 256 \
  --epochs 1

Multiple GPUs on a single node

Code Block
languagepy
titlemultigpu-singlenode.py
linenumberstrue
collapsetrue
# source
# - https://pytorch.org/tutorials/intermediate/dist_tuto.html
# - https://pytorch.org/vision/main/generated/torchvision.datasets.FakeData.html
# - https://tuni-itc.github.io/wiki/Technical-Notes/Distributed_dataparallel_pytorch/#setting-up-the-same-model-with-distributeddataparallel

import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP

from torchvision.models import resnet50
from torchvision.datasets import FakeData
from torchvision.transforms import ToTensor

def main():

    # vars
    batch = 256
    samples = 25600
    epochs = 3

    # init
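    # torchrun (here invoked via torchrun-singlenode.sh) sets RANK, WORLD_SIZE and
    # MASTER_ADDR/MASTER_PORT, which init_process_group("nccl") reads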
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    ngpus = torch.cuda.device_count()

    # model
    model = resnet50(weights=None)
    model = model.to(rank)
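    # DDP keeps one model replica per process and averages gradients across them during backward()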
    model = DDP(model, device_ids=[rank])
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    # data
    dataset = FakeData(samples,
                       num_classes=1000,
                       transform=ToTensor())
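    # DistributedSampler gives each process a disjoint shard of the dataset;
    # batch//ngpus keeps the global batch size at `batch`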
    sampler = DistributedSampler(dataset)
    loader = DataLoader(dataset,
                        batch_size=batch//ngpus,
                        sampler=sampler,
                        shuffle=False,
                        num_workers=2,
                        pin_memory=True,)

    # train
    for epoch in range(epochs):
        start = time.time()
        for step, (images, labels) in enumerate(loader):
            images = images.to(rank)
            labels = labels.to(rank)
            outputs = model(images)
            classes = torch.argmax(outputs, dim=1)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (rank == 0) and (step%10 == 0):
                print('epoch: %3d, batch: %3d, loss: %0.4f' % (epoch+1,
                                                               step,
                                                               loss.item()))
        if (rank == 0):
            elapsed = time.time()-start
            img_sec = samples/elapsed
            print('Epoch complete in %0.2f seconds [%0.2f img/sec]' % (elapsed, img_sec))

    # clean
    dist.destroy_process_group()

if __name__ == "__main__":
    main()


Code Block
languagebash
titlemultigpu-singlenode.sh
linenumberstrue
collapsetrue
#!/bin/bash

#PBS -q gpu
#PBS -l ngpus=4
#PBS -l ncpus=16
#PBS -o output/
#PBS -e output/

# load the module
module load scientific/pytorch/1.14.0-ngc

# move to the directory containing the job script
cd ${PBS_O_WORKDIR:-""}

# run the script using torchrun-singlenode.sh
torchrun-singlenode.sh multigpu-singlenode.py


Notes

Warning
titleUsing multiple GPUs

PyTorch does not automatically distribute computation across multiple GPUs.

When using multiple GPUs, you need to use the PyTorch distributed interface, or a framework for distributed machine learning such as the Ray or Dask libraries.

If you urgently need this functionality, contact us at computing@srce.hr.
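
The distributed pattern used in multigpu-singlenode.py above boils down to a few calls. The sketch below (hypothetical file name ddp-minimal.py, not part of the delivered module) assumes the script is launched with torchrun, e.g. through the torchrun-singlenode.sh wrapper, so that the rank and world-size environment variables are already set:

Code Block
languagepy
titleddp-minimal.py
linenumberstrue
collapsetrue
# minimal torch.distributed + DDP pattern (sketch only; assumes a torchrun launch)
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torchvision.models import resnet50

dist.init_process_group("nccl")        # join the process group set up by torchrun
rank = dist.get_rank()                 # one process per GPU
torch.cuda.set_device(rank)

model = resnet50(weights=None).to(rank)
model = DDP(model, device_ids=[rank])  # gradients are averaged across processes

# ... training loop as in multigpu-singlenode.py ...

dist.destroy_process_group()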


  • run-singlegpu.sh - uses a single GPU
  • torchrun-singlenode.sh - uses multiple GPUs on a single node

Examples of invoking the wrappers can be found in the job scripts described above:

... run-singlenode.sh moja_python_skripta.py ...

Note
titleApptainer and run-singlenode.sh

This library is delivered as a container because of the load that pip/conda virtual environments put on the Lustre shared file systems.

To run Python applications correctly, they must be invoked through the run-singlenode.sh wrappers in PBS job scripts:

Code Block