Pytorch (CIFAR10)

官方文档 中文 https://www.pytorchtutorial.com/docs/

官方文档 https://pytorch.org/docs/stable/index.html

1. Structure

2. Code

train_model.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import torch
from torch import nn


# Network definition (output_features=10)
class Classification10Class(nn.Module):
    """Small CNN classifier for CIFAR-10-sized inputs.

    Expects a batch of shape (N, 3, 32, 32) and returns raw class
    logits of shape (N, 10).
    """

    def __init__(self):
        super().__init__()
        # Three conv+pool stages halve the spatial size each time:
        # 3x32x32 -> 32x16x16 -> 32x8x8 -> 64x4x4, then flatten (1024)
        # and project down to 10 logits through two linear layers.
        self.module = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1, padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Flatten(),
            nn.Linear(in_features=64 * 4 * 4, out_features=64),
            nn.Linear(in_features=64, out_features=10),
        )

    def forward(self, x):
        """Map (N, 3, 32, 32) input to (N, 10) logits."""
        return self.module(x)


# Sanity-check the network shapes when run directly.
if __name__ == '__main__':
    classification = Classification10Class()
    # batch_size=64, channel=3 (RGB), spatial size 32x32
    inputs = torch.ones((64, 3, 32, 32))
    outputs = classification(inputs)  # fixed typo: was `ouputs`
    print(outputs.shape)

train.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
""" CIFAR Dataset """
import torchvision.datasets
from torch.utils.data import DataLoader

from train_model import *

# 1. 准备数据集
train_data = torchvision.datasets.CIFAR10(
root="./dataset",
train=True,
transform=torchvision.transforms.ToTensor(),
download=True,
)

test_data = torchvision.datasets.CIFAR10(
root="./dataset",
train=False,
transform=torchvision.transforms.ToTensor(),
download=True,
)

# 2. 获取数据集长度
train_data_size = len(train_data)
test_data_size = len(test_data)
# print("训练数据集长度为 {}".format(train_data_size))
# print("测试数据集长度为 {}".format(test_data_size))


# 3. 利用DataLoader加载数据集
train_dataloader = DataLoader(
dataset=train_data,
batch_size=64,
)

test_dataloader = DataLoader(
dataset=test_data,
batch_size=64,
)

# 4. 搭建神经网络 (from train_model.py import *)
classification = Classification10Class()

# 5. 损失函数
loss_fn = nn.CrossEntropyLoss()

# 6. 优化器
learning_rate = 0.01
optimizer = torch.optim.SGD(
params=classification.parameters(),
lr=learning_rate,
)

# 7. 设置训练网络的参数
total_train_step = 0 # 训练次数
total_test_step = 0 # 测试次数
epochs = 10 # 训练迭代次数

# 8. 开始训练
for epoch in range(epochs):
print("----------第 {} 轮训练开始----------".format(epoch + 1))

# 训练步骤
for data in train_dataloader:
# 输入输出
images, targets = data
outputs = classification(images)

# 损失函数
loss = loss_fn(outputs, targets)

# 清零梯度
optimizer.zero_grad()

# 反向传播
loss.backward()

# 更新参数
optimizer.step()

total_train_step += 1
if total_train_step % 100 == 0:
print("训练次数: {}, loss: {}".format(total_train_step, loss.item()))

# 测试步骤(不更新参数)
total_test_loss = 0 # 测试集损失累积
with torch.no_grad():
for data in test_dataloader:
images, targets = data
outputs = classification(images)
loss = loss_fn(outputs, targets)
total_test_loss += loss

print("测试集loss: {}".format(total_test_loss))

output

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
----------第 1 轮训练开始----------
训练次数: 100, loss: 2.2914345264434814
训练次数: 200, loss: 2.2848589420318604
训练次数: 300, loss: 2.2572102546691895
训练次数: 400, loss: 2.1692259311676025
训练次数: 500, loss: 2.0409679412841797
训练次数: 600, loss: 2.0187602043151855
训练次数: 700, loss: 2.009617567062378
测试集loss: 313.149169921875
----------第 2 轮训练开始----------
训练次数: 800, loss: 1.878823161125183
训练次数: 900, loss: 1.8439174890518188
训练次数: 1000, loss: 1.9330165386199951
训练次数: 1100, loss: 1.9703041315078735
训练次数: 1200, loss: 1.7066203355789185
训练次数: 1300, loss: 1.668871521949768
训练次数: 1400, loss: 1.7355754375457764
训练次数: 1500, loss: 1.7841742038726807
测试集loss: 309.877685546875
----------第 3 轮训练开始----------
训练次数: 1600, loss: 1.7344536781311035
训练次数: 1700, loss: 1.621435284614563
...

3. Visualization & Save

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
""" CIFAR Dataset """
import torchvision.datasets
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from train_model import *

# 1. 准备数据集
train_data = torchvision.datasets.CIFAR10(
root="./dataset",
train=True,
transform=torchvision.transforms.ToTensor(),
download=True,
)

test_data = torchvision.datasets.CIFAR10(
root="./dataset",
train=False,
transform=torchvision.transforms.ToTensor(),
download=True,
)

# 2. 获取数据集长度
train_data_size = len(train_data)
test_data_size = len(test_data)
# print("训练数据集长度为 {}".format(train_data_size))
# print("测试数据集长度为 {}".format(test_data_size))


# 3. 利用DataLoader加载数据集
train_dataloader = DataLoader(
dataset=train_data,
batch_size=64,
)

test_dataloader = DataLoader(
dataset=test_data,
batch_size=64,
)

# 4. 搭建神经网络 (from train_model.py import *)
classification = Classification10Class()

# 5. 损失函数
loss_fn = nn.CrossEntropyLoss()

# 6. 优化器
learning_rate = 0.01
optimizer = torch.optim.SGD(
params=classification.parameters(),
lr=learning_rate,
)

# 7. 设置训练网络的参数
total_train_step = 0 # 训练次数
total_test_step = 0 # 测试次数
epochs = 10 # 训练迭代次数

# 添加tensorboard可视化
writer = SummaryWriter("./logs")

# 8. 开始训练
for epoch in range(epochs):
print("------------- 第 {} 轮训练开始 -------------".format(epoch + 1))

# 训练步骤
for data in train_dataloader:
# 输入输出
images, targets = data
outputs = classification(images)

# 损失函数
loss = loss_fn(outputs, targets)

# 清零梯度
optimizer.zero_grad()

# 反向传播
loss.backward()

# 更新参数
optimizer.step()

total_train_step += 1
if total_train_step % 100 == 0:
print("训练次数: {}, loss: {}".format(total_train_step, loss.item()))
writer.add_scalar(
tag="train_loss (every 100): ",
scalar_value=loss.item(),
global_step=total_train_step,
)

# 测试步骤(不更新参数)
total_test_loss = 0 # 测试集损失累积
with torch.no_grad():
for data in test_dataloader:
images, targets = data
outputs = classification(images)
loss = loss_fn(outputs, targets)
total_test_loss += loss

print("##### 测试集loss: {} #####".format(total_test_loss))
writer.add_scalar(
tag="test_loss (every epoch): ",
scalar_value=total_test_loss,
global_step=epoch,
)

# 保存每次训练的模型
torch.save(classification, "./models_cifar/classification_{}.pth".format(epoch))
print("##### 模型成功保存 #####")

writer.close()

4. Full Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
""" CIFAR Classification """
import torchvision.datasets
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from train_model import *

# 1. 准备数据集
train_data = torchvision.datasets.CIFAR10(
root="./dataset",
train=True,
transform=torchvision.transforms.ToTensor(),
download=True,
)

test_data = torchvision.datasets.CIFAR10(
root="./dataset",
train=False,
transform=torchvision.transforms.ToTensor(),
download=True,
)

# 2. 获取数据集长度
train_data_size = len(train_data)
test_data_size = len(test_data)
# print("训练数据集长度为 {}".format(train_data_size))
# print("测试数据集长度为 {}".format(test_data_size))


# 3. 利用DataLoader加载数据集
train_dataloader = DataLoader(
dataset=train_data,
batch_size=64,
)

test_dataloader = DataLoader(
dataset=test_data,
batch_size=64,
)

# 4. 搭建神经网络 (from train_model.py import *)
classification = Classification10Class()

# 5. 损失函数
loss_fn = nn.CrossEntropyLoss()

# 6. 优化器
learning_rate = 0.01
optimizer = torch.optim.SGD(
params=classification.parameters(),
lr=learning_rate,
)

# 7. 设置训练网络的参数
total_train_step = 0 # 训练次数
total_test_step = 0 # 测试次数 == epoch
epochs = 10 # 训练迭代次数

# 添加tensorboard可视化
writer = SummaryWriter("./logs")

# 8. 开始训练
for epoch in range(epochs):
print("------------- 第 {} 轮训练开始 -------------".format(epoch + 1))

# 训练步骤
classification.train()
for data in train_dataloader:
# 输入输出
images, targets = data
outputs = classification(images)

# 损失函数
loss = loss_fn(outputs, targets)

# 清零梯度
optimizer.zero_grad()

# 反向传播
loss.backward()

# 更新参数
optimizer.step()

total_train_step += 1
if total_train_step % 100 == 0:
print("训练次数: {}, loss: {}".format(total_train_step, loss.item()))
writer.add_scalar(
tag="train_loss (every 100 steps)",
scalar_value=loss.item(),
global_step=total_train_step,
)

# 测试步骤(不更新参数)
classification.eval()
total_test_loss = 0 # 测试集损失累积
total_accuracy = 0 # 分类问题正确率
with torch.no_grad():
for data in test_dataloader:
images, targets = data
outputs = classification(images)
loss = loss_fn(outputs, targets)
total_test_loss += loss.item()

# 正确率
accuracy = (outputs.argmax(axis=1) == targets).sum()
total_accuracy += accuracy

# 在测试集上的损失
print("##### 在测试集上的loss: {} #####".format(total_test_loss))
writer.add_scalar(
tag="test_loss (every epoch)",
scalar_value=total_test_loss,
global_step=epoch,
)

# 在测试集上的正确率
print("##### 在测试集上的正确率: {} #####".format(total_accuracy / test_data_size))
writer.add_scalar(
tag="test_accuracy (every epoch)",
scalar_value=total_accuracy / test_data_size,
global_step=epoch,
)

# 保存每次训练的模型
torch.save(classification, "./models_cifar/classification_{}.pth".format(epoch))
# torch.save(classification.state_dict(), "./models_cifar/classification_{}.pth".format(epoch)) # 推荐
print("##### 模型成功保存 #####")

writer.close()

5. GPU

1. Google Colab

  • 配置

  • cpu 大约42s gpu 大约8s

2. Method 1: .cuda()

1
2
3
4
5
6
7
8
9
10
''' GPU '''
if torch.cuda.is_available():
model_name = model_name.cuda() # GPU

if torch.cuda.is_available():
loss_fn = loss_fn.cuda() # GPU

if torch.cuda.is_available():
images = images.cuda()
targets = targets.cuda()

3. Method 2: .to(device)

1
2
3
4
5
6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = model_name.to(device)
loss_fn = loss_fn.to(device)
images = images.to(device)
targets = targets.to(device)

6. Validation and Test

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import torchvision.transforms
from PIL import Image
import torch
from torch import nn

image_path = "./image/dog.png"

# 1. Load the image and preprocess it into a 3x32x32 float tensor.
preprocess = torchvision.transforms.Compose([
    torchvision.transforms.Resize((32, 32)),
    torchvision.transforms.ToTensor(),
])
image = preprocess(Image.open(image_path).convert("RGB"))  # torch.Size([3, 32, 32])


# 2. Declare the network architecture (the checkpoint was saved as a
#    whole pickled model, so this class must be in scope to unpickle it).
class Classification10Class(nn.Module):
    """CNN producing 10 class logits from (N, 3, 32, 32) input."""

    def __init__(self):
        super(Classification10Class, self).__init__()
        # NOTE: attribute name `module` must match the saved model.
        self.module = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=5, stride=1, padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, stride=1, padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Flatten(),
            nn.Linear(in_features=64 * 4 * 4, out_features=64),
            nn.Linear(in_features=64, out_features=10),
        )

    def forward(self, x):
        """Forward pass: (N, 3, 32, 32) -> (N, 10) logits."""
        return self.module(x)


# 3. Load the trained weights onto the CPU.
model = torch.load("./models_cifar/classification_gpu_29.pth", map_location=torch.device('cpu'))

# 4. Predict: add a batch dimension and run in inference mode.
image = image.unsqueeze(0)  # torch.Size([1, 3, 32, 32])

model.eval()
with torch.no_grad():
    outputs = model(image)

print(outputs)
print(outputs.argmax(dim=1))