08-加载数据集

在 PyTorch 中,Dataset 用来构造(表示)数据集,DataLoader 用来按批次加载数据集。

构造数据集

  • Dataset是一个抽象类,需要定义一个类来继承。

  • DataLoader是用来导入数据的。

构造框架

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class DiabetesDataset(Dataset):
    """Skeleton of a map-style dataset: every method is a stub to be filled in."""

    def __init__(self):
        """Stub constructor; load or prepare the data here."""

    def __getitem__(self, index):
        """Stub for ``dataset[index]``; returns ``None`` until implemented."""
        return None

    def __len__(self):
        """Stub for the number of samples; returns ``None`` until implemented."""
        return None
    
# Instantiate the dataset and wrap it in a shuffling mini-batch loader.
dataset = DiabetesDataset()
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=2,  # number of loader worker subprocesses
)

定义数据集

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np

class DiabetesDataset(Dataset):
    """Map-style dataset backed by a comma-separated numeric file.

    Every row is one sample: all columns except the last are the input
    features, and the last column is the target.
    """

    def __init__(self, filepath):
        """Load ``filepath`` and split it into feature and target tensors.

        Args:
            filepath: Path of a comma-delimited text file readable by
                ``np.loadtxt``.
        """
        # ndmin=2 keeps a single-row file two-dimensional so the column
        # slicing below still works.
        xy = np.loadtxt(filepath, delimiter=',', dtype=np.float32, ndmin=2)
        self.len = xy.shape[0]
        # Features: every column except the last one (the label).
        self.x_data = torch.from_numpy(xy[:, :-1])
        # Indexing with a list keeps the target two-dimensional: (rows, 1).
        self.y_data = torch.from_numpy(xy[:, [-1]])

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair for ``dataset[index]``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples (rows) that were loaded."""
        return self.len
    
# Load the diabetes data and serve it in shuffled mini-batches of 32.
dataset = DiabetesDataset('./数据集/diabetes/diabetes.csv.gz')
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2)

训练数据

class Model(torch.nn.Module):
    """Three stacked linear layers (8 -> 6 -> 4 -> 1), each followed by a sigmoid."""

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(8, 6)
        self.linear2 = torch.nn.Linear(6, 4)
        self.linear3 = torch.nn.Linear(4, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through the three layers, applying the sigmoid after each."""
        for layer in (self.linear1, self.linear2, self.linear3):
            x = self.sigmoid(layer(x))
        return x
# Model, summed binary cross-entropy loss, and SGD over the model's parameters.
model = Model()
criterion = torch.nn.BCELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)

for epoch in range(100):
    for i, data in enumerate(train_loader):
        x, y = data  # each batch unpacks into an (inputs, targets) pair

        # Clear gradients from the previous step before computing new ones.
        optimizer.zero_grad()

        # Forward pass and loss for this mini-batch.
        y_pred = model(x)
        loss = criterion(y_pred, y)
        print(epoch, i, loss.item())

        # Backward pass, then parameter update.
        loss.backward()
        optimizer.step()

运行后出现如下错误:

 RuntimeError: DataLoader worker (pid(s) 20700, 7588) exited unexpectedly 

因为 torch.utils.data.DataLoader 中设置了 num_workers=2,也就是用多个子进程读取数据。PyTorch 在 Windows 下以 spawn 方式启动子进程,子进程会重新导入主模块,因此训练代码需要放在 if __name__ == '__main__': 之下(如下所示);或者将 num_workers 改为 0,只在主进程中读取数据。

# Guarding the loop keeps it from running when this module is imported
# rather than executed as a script.
if __name__ == '__main__':
    for epoch in range(10):
        for i, data in enumerate(train_loader):
            x, y = data  # each batch unpacks into an (inputs, targets) pair

            # Reset gradients, then run the forward pass and compute the loss.
            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)
            print(epoch, i, loss.item())

            # Backpropagate and apply the parameter update.
            loss.backward()
            optimizer.step()

例子

  • 手写数字集

image-20210303115228526

最后更新于

这有帮助吗?