You are viewing an old version of this page. View the current version.
Compare with Current
View Page History
« Previous
Version 12
Next »
Opis
PyTorch je Python knjižnica namijenjena razvoju aplikacija temeljenih na dubokom učenju koja se oslanja na ubrzanje grafičkim procesorima. Glavne prednosti koje PyTorch knjižnica pruža su imperativni pristup programiranju na "Python" način, kroz sučelje koje omogućuje lakše otkrivanje grešaka i koje je prilagođeno postojećim Python znanstvenim knjižnicama.
Verzije
verzija | modul | python | redovi |
---|
1.8.0 | scientific/pytorch/1.8.0-ngc | 3.8 | gpu gpu-test login-gpu |
1.14.0 | scientific/pytorch/1.14.0-ngc |
Dokumentacija
Primjeri
Ispod se nalaze primjeri pozivanja naredbi i aplikacija unutar kontejnera te primjeri sintetičkog benchmarka koji testira performanse na modelu ResNet50.
Pozivanje naredbi i aplikacija unutar kontejnera
[korisnik@x3000c0s25b0n0] $ module load scientific/pytorch/1.14.0-ngc
[korisnik@x3000c0s25b0n0] $ run-command.sh pip3 list
INFO: underlay of /etc/localtime required more than 50 (95) bind mounts
INFO: underlay of /usr/bin/nvidia-smi required more than 50 (474) bind mounts
13:4: not a valid test operator: (
13:4: not a valid test operator: 510.47.03
Package Version
----------------------- -------------------------------
absl-py 1.3.0
accelerate 0.19.0
apex 0.1
appdirs 1.4.4
argon2-cffi 21.3.0
argon2-cffi-bindings 21.2.0
asttokens 2.2.1
...
Aplikacija na jednom grafičkom procesoru
# source
# - https://github.com/horovod/horovod/blob/master/examples/pytorch/pytorch_synthetic_benchmark.py
import argparse
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data.distributed
from torchvision import models
import sys
import time
import numpy as np
# Benchmark settings.
# ArgumentDefaultsHelpFormatter is passed so that --help lists each default.
parser = argparse.ArgumentParser(
    description='PyTorch Synthetic Benchmark',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

# One row per command-line option: (flag spellings, add_argument keywords).
# Rows are registered in this order.
_option_table = (
    (("-i", "--images"), dict(type=int, help="image number", default=1024)),
    (('--batch_size',), dict(type=int, default=32, help='input batch size')),
    (("-e", "--epochs"), dict(type=int, help="epochs", default=1)),
    (('--model',), dict(type=str, default='resnet50', help='model to benchmark')),
)
for _flags, _settings in _option_table:
    parser.add_argument(*_flags, **_settings)

# Parsed options, read by the rest of the script as args.<name>.
args = parser.parse_args()
# model: look up the constructor named by --model in torchvision.models,
# instantiate it with its default arguments and move it to the GPU
model_factory = getattr(models, args.model)
model = model_factory()
model.cuda()

# plain SGD; lr_scaler is a multiplier on the base learning rate of 0.01
lr_scaler = 1
optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler)

# turn on cuDNN benchmark mode
cudnn.benchmark = True

# data: one fixed random batch of shape (batch_size, 3, 224, 224) and one
# random integer label per sample reduced modulo 1000, both moved to the GPU
data = torch.randn(args.batch_size, 3, 224, 224).cuda()
target = (torch.LongTensor(args.batch_size).random_() % 1000).cuda()
# fit
def benchmark_step():
    """Run one training step on the fixed synthetic batch.

    Uses the module-level ``model``, ``optimizer``, ``data`` and ``target``:
    clears the gradients, runs the forward pass, computes the cross-entropy
    loss, back-propagates and applies the optimizer update.

    Returns:
        The value of ``loss.item()`` for this step.
    """
    optimizer.zero_grad()
    step_loss = F.cross_entropy(model(data), target)
    step_loss.backward()
    optimizer.step()
    return step_loss.item()
# Only whole batches are run; a trailing partial batch
# (args.images % args.batch_size images) is skipped.
steps_per_epoch = args.images // args.batch_size
# Images that are really pushed through the model in one epoch.
images_per_epoch = steps_per_epoch * args.batch_size

for epoch in range(args.epochs):
    begin = time.time()
    for batches in range(steps_per_epoch):
        loss = benchmark_step()
        # Report the loss every tenth step.
        if batches % 10 == 0:
            print('--- Epoch %2i, Batch %3i: Loss = %0.2f ---' % (epoch,
                                                                  batches,
                                                                  loss,))
    end = time.time()
    elapsed = end - begin
    # True division over the images actually processed: the previous floor
    # division dropped the fractional part that the %0.2f format is meant to
    # show, and counted images from the skipped partial batch.
    imgsec = images_per_epoch / elapsed if elapsed > 0 else 0.0
    print('--- Epoch %2i finished: %0.2f img/sec ---' % (epoch, imgsec))
#!/bin/bash
#PBS -q gpu
#PBS -l ngpus=1

# load the module
module load scientific/pytorch/1.14.0-ngc

# move to the directory containing the script; the expansion is quoted so a
# path with whitespace stays one argument, "." is used when PBS_O_WORKDIR is
# unset, and the job stops if the directory cannot be entered
cd "${PBS_O_WORKDIR:-.}" || exit 1

# run the script using run-singlegpu.sh
run-singlegpu.sh singlegpu.py \
    --images 25600 \
    --batch_size 256 \
    --epochs 1
Aplikacija na više grafičkih procesora i jednom čvoru
# source
# - https://pytorch.org/tutorials/intermediate/dist_tuto.html
# - https://pytorch.org/vision/main/generated/torchvision.datasets.FakeData.html
# - https://tuni-itc.github.io/wiki/Technical-Notes/Distributed_dataparallel_pytorch/#setting-up-the-same-model-with-distributeddataparallel
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torchvision.models import resnet50
from torchvision.datasets import FakeData
from torchvision.transforms import ToTensor
def main():
    """Train ResNet-50 on synthetic images with DistributedDataParallel.

    Each process joins the NCCL process group, uses its rank as the CUDA
    device index, wraps the model in DDP and trains on a ``FakeData`` dataset
    partitioned by a ``DistributedSampler``. The process with rank 0 prints
    the loss every tenth step and the throughput at the end of each epoch.
    """
    # vars
    batch = 256        # batch size summed over all GPUs of the node
    samples = 25600    # number of synthetic images in the dataset
    epochs = 3

    # init
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    ngpus = torch.cuda.device_count()

    # model: the rank doubles as the device index
    model = resnet50(weights=None)
    model = model.to(rank)
    model = DDP(model, device_ids=[rank])
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    # data: every process gets batch // ngpus images per step
    dataset = FakeData(samples,
                       num_classes=1000,
                       transform=ToTensor())
    sampler = DistributedSampler(dataset)
    loader = DataLoader(dataset,
                        batch_size=batch // ngpus,
                        sampler=sampler,
                        shuffle=False,
                        num_workers=2,
                        pin_memory=True,)

    # train
    for epoch in range(epochs):
        # DistributedSampler derives its shuffle from the epoch number;
        # without this call every epoch would iterate in the same order.
        sampler.set_epoch(epoch)
        start = time.time()
        # "step" rather than "batch" so the batch-size variable above is not
        # overwritten by the loop counter.
        for step, (images, labels) in enumerate(loader):
            images = images.to(rank)
            labels = labels.to(rank)
            outputs = model(images)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (rank == 0) and (step % 10 == 0):
                print('epoch: %3d, batch: %3d, loss: %0.4f' % (epoch+1,
                                                               step,
                                                               loss.item()))
        if rank == 0:
            elapsed = time.time() - start
            img_sec = samples / elapsed
            print('Epoch complete in %s seconds [%f img/sec] ' % (elapsed, img_sec))

    # clean
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
#!/bin/bash
#PBS -q gpu
#PBS -l ngpus=4
#PBS -l ncpus=16

# load the module
module load scientific/pytorch/1.14.0-ngc

# move to the directory containing the script; the expansion is quoted so a
# path with whitespace stays one argument, "." is used when PBS_O_WORKDIR is
# unset, and the job stops if the directory cannot be entered
cd "${PBS_O_WORKDIR:-.}" || exit 1

# run the script using torchrun-singlenode.sh
torchrun-singlenode.sh multigpu-singlenode.py
Aplikacija na više grafičkih procesora i više čvorova
# source
# - https://pytorch.org/tutorials/intermediate/dist_tuto.html
# - https://pytorch.org/vision/main/generated/torchvision.datasets.FakeData.html
# - https://tuni-itc.github.io/wiki/Technical-Notes/Distributed_dataparallel_pytorch/#setting-up-the-same-model-with-distributeddataparallel
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torchvision.models import resnet50
from torchvision.datasets import FakeData
from torchvision.transforms import ToTensor
def main():
    """Train ResNet-50 on synthetic images with DDP across several nodes.

    Each process joins the NCCL process group and reads two environment
    variables: ``LOCAL_RANK`` is used as the CUDA device index and ``RANK``
    decides which single process prints progress. The model is wrapped in
    DDP and trained on a ``FakeData`` dataset partitioned by a
    ``DistributedSampler``.

    Raises:
        KeyError: if ``LOCAL_RANK`` or ``RANK`` is not set in the environment.
    """
    # vars
    batch = 256          # batch size used by every process
    samples = 256*100    # number of synthetic images in the dataset
    epochs = 3

    # init
    dist.init_process_group("nccl")
    rank = int(os.environ['LOCAL_RANK'])
    global_rank = int(os.environ['RANK'])

    # model: the local rank selects the device
    model = resnet50(weights=None)
    model = model.to(rank)
    model = DDP(model, device_ids=[rank])
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    # data
    dataset = FakeData(samples,
                       num_classes=1000,
                       transform=ToTensor())
    sampler = DistributedSampler(dataset)
    loader = DataLoader(dataset,
                        batch_size=batch,
                        sampler=sampler,
                        shuffle=False,
                        num_workers=1,
                        pin_memory=True,)

    # train
    for epoch in range(epochs):
        # DistributedSampler derives its shuffle from the epoch number;
        # without this call every epoch would iterate in the same order.
        sampler.set_epoch(epoch)
        start = time.time()
        # "step" rather than "batch" so the batch-size variable above is not
        # overwritten by the loop counter.
        for step, (images, labels) in enumerate(loader):
            images = images.to(rank)
            labels = labels.to(rank)
            outputs = model(images)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (global_rank == 0) and (step % 10 == 0):
                print('epoch: %3d, batch: %3d/%3d, loss: %0.4f' % (epoch+1,
                                                                   step,
                                                                   len(loader),
                                                                   loss.item()))
        if global_rank == 0:
            elapsed = time.time() - start
            img_sec = samples / elapsed
            print('Epoch complete in %0.2f seconds [%0.2f img/sec] ' % (elapsed, img_sec))

    # clean
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
#!/bin/bash
#PBS -q gpu
#PBS -l select=8:ngpus=1:ncpus=4

# load the modules
module load scientific/pytorch/1.14.0-ngc
module load cray-pals

# move to the directory containing the script; the expansion is quoted so a
# path with whitespace stays one argument, "." is used when PBS_O_WORKDIR is
# unset, and the job stops if the directory cannot be entered
cd "${PBS_O_WORKDIR:-.}" || exit 1

# run the script using torchrun-multinode.sh
mpiexec --cpu-bind none torchrun-multinode.sh multigpu-multinode.py
Napomene