...
Verzije
verzija | modul | python | Supek | Padobran |
---|---|---|---|---|
1.8.0 | scientific/pytorch/1.8.0-ngc | 3.8 | ||
1.14.0 | scientific/pytorch/1.14.0-ngc | 3.8 | ||
2.0.0 | scientific/pytorch/2.0.0 | 3.10 | ||
2.0.0 | scientific/pytorch/2.0.0-ngc | 3.10 | ||
Note | ||
---|---|---|
| ||
Python aplikacije i knjižnice na Supeku su dostavljene u obliku kontejnera i zahtijevaju korištenje wrappera kao što je opisano ispod. Više informacija o python aplikacijama i kontejnerima na Supeku možete dobiti na sljedećim poveznicama: |
Dokumentacija
- Službena stranica - https://pytorch.org/
- Priručnik - https://pytorch.org/docs/stable/index.html
- distributed - https://pytorch.org/docs/stable/distributed.html
- torchrun
- accelerate
...
Supek
Ispod se nalaze primjeri pozivanja naredbi i aplikacija unutar kontejnera i aplikacija umjetnog benchmarka koji testira performanse na modelu Resnet50.
...
Code Block | ||||||||
---|---|---|---|---|---|---|---|---|
| ||||||||
# source
# - https://github.com/horovod/horovod/blob/master/examples/pytorch/pytorch_synthetic_benchmark.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from accelerate import Accelerator
from torchvision import models
from torch.utils.data import DataLoader
from torchvision.datasets import FakeData
from torchvision.transforms import ToTensor
import os
import sys
import time
import pprint
import socket
import numpy as np
def main():
    """Run a synthetic ResNet-50 training benchmark through Accelerate.

    Builds a torchvision model, trains it on randomly generated images
    (``FakeData``) with SGD and an exponentially decaying learning rate,
    and prints the loss per batch and the throughput per epoch.

    Only the main process prints. The reported img/sec is the full dataset
    size divided by the epoch wall time measured on that process.
    NOTE(review): this equals the aggregate throughput only if the prepared
    loader splits the dataset across processes -- confirm for the launcher
    in use.
    """
    # settings
    epochs = 10
    batch_size = 256
    image_number = 256*30
    model_name = 'resnet50'
    log_every = 1  # print the loss every N batches

    # accelerator
    accelerator = Accelerator()
    # ask Accelerate which process is the main one instead of parsing the
    # RANK environment variable by hand
    is_main = accelerator.is_main_process

    # model: look up the constructor by name and build it with default args
    model = getattr(models, model_name)()
    model.to(accelerator.device)

    # optimizer
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    loss_function = nn.CrossEntropyLoss()

    # loader: synthetic images, 1000 classes, converted to tensors
    data = FakeData(image_number,
                    num_classes=1000,
                    transform=ToTensor())
    loader = DataLoader(data,
                        batch_size=batch_size)

    # scheduler
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    # prepare: hand every training object to Accelerate
    model, optimizer, loader, scheduler = accelerator.prepare(model,
                                                              optimizer,
                                                              loader,
                                                              scheduler)

    # fit
    for epoch in range(epochs):
        start = time.time()
        for batch, (images, labels) in enumerate(loader):
            optimizer.zero_grad()
            images = images.to(accelerator.device)
            labels = labels.to(accelerator.device)
            outputs = model(images)
            loss = loss_function(outputs, labels)
            # backward goes through the accelerator rather than loss.backward()
            accelerator.backward(loss)
            optimizer.step()
            # the learning rate is decayed after every optimizer step
            scheduler.step()
            if is_main and batch % log_every == 0:
                # .item() turns the scalar loss tensor into a plain float
                print('--- Epoch %2i, Batch %3i: Loss = %0.2f ---' % (epoch, batch, loss.item(),))
        if is_main:
            end = time.time()
            imgsec = image_number/(end-start)
            print('--- Epoch %2i, Finished: %0.2f img/sec ---' % (epoch, imgsec))
# script entry point: run the benchmark only when the file is executed directly
if __name__ == '__main__':
main() |
Vrančić
Ispod se nalazi primjer aplikacije umjetnog benchmarka koji testira performanse na modelu Resnet50.
Code Block | ||||||||
---|---|---|---|---|---|---|---|---|
| ||||||||
# PBS job script that runs singlenode.py
# queue to submit the job to
#PBS -q cpu
# number of CPU cores requested
#PBS -l ncpus=32
# amount of memory requested
#PBS -l mem=50GB
# environment: load the PyTorch 2.0.0 module
module load scientific/pytorch/2.0.0
# use as many OpenMP threads as NCPUS specifies
# NOTE(review): NCPUS is presumably set by PBS from the ncpus request above -- confirm
export OMP_NUM_THREADS=${NCPUS}
# run: change to the directory stored in PBS_O_WORKDIR (empty string when unset)
cd ${PBS_O_WORKDIR:-""}
python singlenode.py |
Code Block | ||||||||
---|---|---|---|---|---|---|---|---|
| ||||||||
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.models import resnet50
from torchvision.datasets import FakeData
from torchvision.transforms import ToTensor
def main():
    """Train ResNet-50 on synthetic images and report the throughput.

    Builds an untrained torchvision ResNet-50 (``weights=None``), trains it
    with SGD on randomly generated images (``FakeData``) for a few epochs,
    prints the loss every ``log_every`` batches, and prints the images per
    second reached in each epoch. No tensors are moved to another device,
    so everything runs on the default device.
    """
    # vars
    batch_size = 16
    samples = 16*30
    epochs = 3
    log_every = 10  # print progress every N batches

    # model
    model = resnet50(weights=None)
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    # data: synthetic images, 1000 classes, converted to tensors
    dataset = FakeData(samples,
                       num_classes=1000,
                       transform=ToTensor())
    # pin_memory stays at its default (off): pinned host memory only speeds
    # up host-to-accelerator copies, and this script performs none
    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        num_workers=1)

    # train
    for epoch in range(epochs):
        start = time.time()
        # the loop index has its own name so it does not overwrite batch_size
        for step, (images, labels) in enumerate(loader):
            outputs = model(images)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % log_every == 0:
                print('--- Epoch %i, Batch %3i / %3i, Loss = %0.2f ---' % (epoch,
                                                                            step,
                                                                            len(loader),
                                                                            loss.item()))
        elapsed = time.time()-start
        imgsec = samples/elapsed
        print('--- Epoch %i finished: %0.2f img/sec ---' % (epoch,
                                                            imgsec))
# script entry point: run the benchmark only when the file is executed directly
if __name__ == "__main__":
main() |
Napomene
Tip | ||
---|---|---|
| ||
Ova knjižnica je dostavljena u obliku kontejnera, zbog opterećenja koje pip/conda virtualna okruženja stvaraju na Lustre dijeljenim datotečnim sustavima. Za ispravno izvršavanje python aplikacija ili naredbi koje se u njemu nalaze, potrebno je koristiti wrappere u skriptama sustava PBS:
Načini pozivanja wrappera opisani su u primjerima iznad. |
...