import paddle
import numpy as np
paddle.__version__
'2.0.0-rc1'
print('Vision datasets:', paddle.vision.datasets.__all__)
print('Text datasets:', paddle.text.datasets.__all__)
Vision datasets: ['DatasetFolder', 'ImageFolder', 'MNIST', 'FashionMNIST', 'Flowers', 'Cifar10', 'Cifar100', 'VOC2012']
Text datasets: ['Conll05st', 'Imdb', 'Imikolov', 'Movielens', 'UCIHousing', 'WMT14', 'WMT16']
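# The text datasets follow the same constructor convention as the vision ones.
# A minimal sketch (assumption: Imdb accepts mode='train' and downloads to the
# default cache directory, as the vision datasets do) that inspects one sample:
imdb_train = paddle.text.datasets.Imdb(mode='train')
doc, label = imdb_train[0]  # doc is a word-id sequence, label is 0/1
print(len(imdb_train), doc.shape, label)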
import paddle.vision as vision
print("訓練集下載中...")
# 訓練數據集
train_dataset = vision.datasets.MNIST(mode='train')
print("訓練集下載完成!")
print("測試集下載中...")
# 驗證數據集
test_dataset = vision.datasets.MNIST(mode='test')
print("測試集下載完成!")
Downloading training set...
Cache file /home/aistudio/.cache/paddle/dataset/mnist/train-images-idx3-ubyte.gz not found, downloading https://dataset.bj.bcebos.com/mnist/train-images-idx3-ubyte.gz
Begin to download
Download finished
Cache file /home/aistudio/.cache/paddle/dataset/mnist/train-labels-idx1-ubyte.gz not found, downloading https://dataset.bj.bcebos.com/mnist/train-labels-idx1-ubyte.gz
Begin to download
........
Download finished
Training set downloaded!
Downloading test set...
Cache file /home/aistudio/.cache/paddle/dataset/mnist/t10k-images-idx3-ubyte.gz not found, downloading https://dataset.bj.bcebos.com/mnist/t10k-images-idx3-ubyte.gz
Begin to download
Download finished
Cache file /home/aistudio/.cache/paddle/dataset/mnist/t10k-labels-idx1-ubyte.gz not found, downloading https://dataset.bj.bcebos.com/mnist/t10k-labels-idx1-ubyte.gz
Begin to download
..
Download finished
Test set downloaded!
import numpy as np
import matplotlib.pyplot as plt
train_data_0, train_label_0 = np.array(train_dataset[0][0]), train_dataset[0][1]
train_data_0 = train_data_0.reshape([28, 28])
plt.figure(figsize=(2, 2))
plt.imshow(train_data_0, cmap=plt.cm.binary)
print('train_data_0 label is: ' + str(train_label_0))
train_data_0 label is: [5]
class MyDataset(paddle.io.Dataset):
    """
    Step 1: inherit the paddle.io.Dataset class
    (map-style, since this class implements __getitem__ and __len__;
    paddle.io.IterableDataset is for stream-style datasets with __iter__)
    """
    def __init__(self, mode='train'):
        """
        Step 2: implement the constructor; define how the data is read
        and split into training and test sets
        """
        super(MyDataset, self).__init__()
        if mode == 'train':
            self.data = [
                ['train_image_0.jpg', '1'],
                ['train_image_1.jpg', '2'],
                ['train_image_2.jpg', '3'],
                ['train_image_3.jpg', '4'],
            ]
        else:
            self.data = [
                ['test_image_0.jpg', '1'],
                ['test_image_1.jpg', '2'],
                ['test_image_2.jpg', '3'],
                ['test_image_3.jpg', '4'],
            ]

    def _load_img(self, image_path):
        # In practice, read the image with Pillow or a similar library;
        # here we mock the data with random values
        image = np.random.randn(32, 32, 3)
        return image

    def __getitem__(self, index):
        """
        Step 3: implement __getitem__; define how to fetch the sample at
        a given index and return it as a (data, label) pair
        """
        image = self._load_img(self.data[index][0])
        label = self.data[index][1]
        return image, np.array(label, dtype='int64')

    def __len__(self):
        """
        Step 4: implement __len__; return the total number of samples
        """
        return len(self.data)
# Test the custom dataset
train_dataset = MyDataset(mode='train')
test_dataset = MyDataset(mode='test')
print('=============train dataset=============')
for image, label in train_dataset:
    print('image shape: {}, label: {}'.format(image.shape, label))
print('=============evaluation dataset=============')
for image, label in test_dataset:
    print('image shape: {}, label: {}'.format(image.shape, label))
=============train dataset=============
image shape: (32, 32, 3), label: 1
image shape: (32, 32, 3), label: 2
image shape: (32, 32, 3), label: 3
image shape: (32, 32, 3), label: 4
=============evaluation dataset=============
image shape: (32, 32, 3), label: 1
image shape: (32, 32, 3), label: 2
image shape: (32, 32, 3), label: 3
image shape: (32, 32, 3), label: 4
# Download the training images
!wget http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
# Download the training labels
!wget http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
# Download the test images
!wget http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
# Download the test labels
!wget http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
import os
import gzip

class FashionMNISTDataset(paddle.io.Dataset):
    """
    Step 1: inherit the paddle.io.Dataset class
    """
    def __init__(self, path='./', mode='train'):
        """
        Step 2: implement the constructor; define how the data is read
        and split into training and test sets
        """
        super(FashionMNISTDataset, self).__init__()
        images_data_path = os.path.join(path, '%s-images-idx3-ubyte.gz' % mode)
        labels_data_path = os.path.join(path, '%s-labels-idx1-ubyte.gz' % mode)
        with gzip.open(labels_data_path, 'rb') as lbpath:
            self.labels = np.frombuffer(lbpath.read(), dtype=np.uint8,
                                        offset=8)
        with gzip.open(images_data_path, 'rb') as imgpath:
            self.images = np.frombuffer(imgpath.read(), dtype=np.uint8,
                                        offset=16).reshape(len(self.labels), 784)

    def __getitem__(self, index):
        """
        Step 3: implement __getitem__; define how to fetch the sample at
        a given index and return it as a (data, label) pair
        """
        image = self.images[index]
        label = self.labels[index]
        return image, label

    def __len__(self):
        """
        Step 4: implement __len__; return the total number of samples
        """
        return len(self.images)
# Test the custom dataset
fashion_mnist_train_dataset = FashionMNISTDataset(mode='train')
fashion_mnist_test_dataset = FashionMNISTDataset(mode='t10k')
# Visualize the first training sample
fashion_mnist_train_data_0 = np.array(fashion_mnist_train_dataset[0][0])
fashion_mnist_train_label_0 = fashion_mnist_train_dataset[0][1]
fashion_mnist_train_data_0 = fashion_mnist_train_data_0.reshape([28, 28])
plt.figure(figsize=(2, 2))
plt.imshow(fashion_mnist_train_data_0, cmap=plt.cm.binary)
print('fashion_mnist_train_data_0 label is: ' + str(fashion_mnist_train_label_0))
fashion_mnist_train_data_0 label is: 9
import math
import paddle
import numpy as np
from paddle.io import IterableDataset, DataLoader, get_worker_info

class SplitedIterableDataset(IterableDataset):
    """
    Step 1: inherit the paddle.io.IterableDataset class
    """
    def __init__(self, start, end):
        self.start = start
        self.end = end

    def __iter__(self):
        """
        Step 2: implement __iter__; split the sample range across
        DataLoader workers so each worker yields a disjoint shard
        """
        worker_info = get_worker_info()
        if worker_info is None:
            # Single-process loading: iterate over the full range
            iter_start = self.start
            iter_end = self.end
        else:
            # Multi-process loading: carve out this worker's shard
            per_worker = int(
                math.ceil((self.end - self.start) / float(
                    worker_info.num_workers)))
            worker_id = worker_info.id
            iter_start = self.start + worker_id * per_worker
            iter_end = min(iter_start + per_worker, self.end)
        for i in range(iter_start, iter_end):
            yield np.array([i])
dataset = SplitedIterableDataset(start=2, end=9)
dataloader = DataLoader(dataset, num_workers=2, batch_size=1, drop_last=True)
for data in dataloader:
    print(data[0].numpy())
[[2]]
[[6]]
[[3]]
[[7]]
[[4]]
[[8]]
[[5]]
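# Batches from the two workers interleave because they run concurrently.
# A quick sanity check (a sketch reusing the dataloader above): collect and
# sort every yielded index to confirm the range [2, 9) is covered exactly once.
values = sorted(int(batch[0].numpy()[0][0]) for batch in dataloader)
print(values)  # expected: [2, 3, 4, 5, 6, 7, 8]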
from paddle.io import TensorDataset
input_np = np.random.random([2, 3, 4]).astype('float32')
input_tensor = paddle.to_tensor(input_np)
label_np = np.random.random([2, 1]).astype('int32')
label_tensor = paddle.to_tensor(label_np)
dataset = TensorDataset([input_tensor, label_tensor])
for i in range(len(dataset)):
    input, label = dataset[i]
    print(input, label)
Tensor(shape=[3, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
[[0.91451722, 0.94088864, 0.52030772, 0.80783033],
[0.74379814, 0.18669823, 0.41893899, 0.89299613],
[0.67413408, 0.82801068, 0.02079745, 0.95862854]]) Tensor(shape=[1], dtype=int32, place=CUDAPlace(0), stop_gradient=True,
[0])
Tensor(shape=[3, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
[[0.30733261, 0.82390237, 0.99652219, 0.93594497],
[0.62558615, 0.83836132, 0.34213212, 0.72257715],
[0.80075997, 0.38913822, 0.25709155, 0.00520579]]) Tensor(shape=[1], dtype=int32, place=CUDAPlace(0), stop_gradient=True,
[0])
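# Like any map-style dataset, a TensorDataset can be batched with a
# DataLoader; a minimal sketch (batch_size=2 consumes both samples at once):
loader = paddle.io.DataLoader(dataset, batch_size=2)
for batch_input, batch_label in loader:
    print(batch_input.shape, batch_label.shape)  # [2, 3, 4] and [2, 1]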
print("飛槳支持的數據預處理方式:" + str(paddle.vision.transforms.__all__))
飛槳支持的數據預處理方式:['BaseTransform', 'Compose', 'Resize', 'RandomResizedCrop', 'CenterCrop', 'RandomHorizontalFlip', 'RandomVerticalFlip', 'Transpose', 'Normalize', 'BrightnessTransform', 'SaturationTransform', 'ContrastTransform', 'HueTransform', 'ColorJitter', 'RandomCrop', 'Pad', 'RandomRotation', 'Grayscale', 'ToTensor', 'to_tensor', 'hflip', 'vflip', 'resize', 'pad', 'rotate', 'to_grayscale', 'crop', 'center_crop', 'adjust_brightness', 'adjust_contrast', 'adjust_hue', 'normalize']
import paddle.vision.transforms as T
# Option 1: apply a single transform (brightness adjustment) to the images
transform = T.BrightnessTransform(0.4)
# Pass the transform via the transform argument to apply the data
# augmentation to the built-in dataset
train_dataset_without_transform = vision.datasets.Cifar10(mode='train')
train_dataset_with_transform = vision.datasets.Cifar10(mode='train', transform=transform)
index = 10
print("Image without brightness adjustment")
train_dataset_without_data_0 = np.array(train_dataset_without_transform[index][0])
train_dataset_without_data_0 = train_dataset_without_data_0.astype('float32') / 255.
plt.imshow(train_dataset_without_data_0)
Image without brightness adjustment
<matplotlib.image.AxesImage at 0x7fb13e129090>
print("調整亮度的圖像")
train_dataset_with_data_0 = np.array(train_dataset_with_transform[index][0])
train_dataset_with_data_0 = train_dataset_with_data_0.astype('float32') / 255.
plt.imshow(train_dataset_with_data_0)
調整亮度的圖像
<matplotlib.image.AxesImage at 0x7fb19b1b5f90>
import paddle.vision.transforms as T
# Option 2: compose multiple transforms and apply them in sequence
transform = T.Compose([T.BrightnessTransform(0.4), T.ContrastTransform(0.4)])
# Pass the composed transform via the transform argument to apply the data
# augmentations to the built-in dataset
train_dataset_without_compose = vision.datasets.Cifar10(mode='train')
train_dataset_with_compose = vision.datasets.Cifar10(mode='train', transform=transform)
index = 10
print("Original image")
train_dataset_without_compose_data_0 = np.array(train_dataset_without_compose[index][0])
train_dataset_without_compose_data_0 = train_dataset_without_compose_data_0.astype('float32') / 255.
plt.imshow(train_dataset_without_compose_data_0)
Original image
<matplotlib.image.AxesImage at 0x7fb13065fb90>
print("多種調整後的圖像")
train_dataset_with_compose_data_0 = np.array(train_dataset_with_compose[index][0])
train_dataset_with_compose_data_0 = train_dataset_with_compose_data_0.astype('float32') / 255.
plt.imshow(train_dataset_with_compose_data_0)
多種調整後的圖像
<matplotlib.image.AxesImage at 0x7fb1b818c610>
class FashionMNISTDataset(paddle.io.Dataset):
    """
    Step 1: inherit the paddle.io.Dataset class
    """
    def __init__(self, path='./', mode='train', transform=None):
        """
        Step 2: implement the constructor; define how the data is read
        and split into training and test sets; an optional transform is
        applied per sample in __getitem__
        """
        super(FashionMNISTDataset, self).__init__()
        images_data_path = os.path.join(path, '%s-images-idx3-ubyte.gz' % mode)
        labels_data_path = os.path.join(path, '%s-labels-idx1-ubyte.gz' % mode)
        with gzip.open(labels_data_path, 'rb') as lbpath:
            self.labels = np.frombuffer(lbpath.read(), dtype=np.uint8,
                                        offset=8)
        with gzip.open(images_data_path, 'rb') as imgpath:
            self.images = np.frombuffer(imgpath.read(), dtype=np.uint8,
                                        offset=16).reshape(len(self.labels), 784)
        # Store the transform directly; None means "no augmentation"
        # (the original used the string 'None' as a sentinel, which is
        # error-prone)
        self.transform = transform

    def __getitem__(self, index):
        """
        Step 3: implement __getitem__; define how to fetch the sample at
        a given index and return it as a (data, label) pair
        """
        if self.transform is not None:
            image = self.transform(self.images[index].reshape(28, 28))
        else:
            image = self.images[index]
        label = self.labels[index]
        return image, label

    def __len__(self):
        """
        Step 4: implement __len__; return the total number of samples
        """
        return len(self.images)
# Test the dataset without a transform
fashion_mnist_train_dataset_without_transform = FashionMNISTDataset(mode='train')
# Visualize the first sample
fashion_mnist_train_data_0 = np.array(fashion_mnist_train_dataset_without_transform[0][0])
fashion_mnist_train_data_0 = fashion_mnist_train_data_0.reshape([28, 28])
plt.imshow(fashion_mnist_train_data_0, cmap=plt.cm.binary)
<matplotlib.image.AxesImage at 0x7fb130421ed0>
# Test the dataset with a transform
from paddle.vision.transforms import RandomVerticalFlip
fashion_mnist_train_dataset_with_transform = FashionMNISTDataset(mode='train', transform=RandomVerticalFlip(0.4))
# Visualize the first sample
fashion_mnist_train_transformed_data_0 = np.array(fashion_mnist_train_dataset_with_transform[0][0])
fashion_mnist_train_transformed_data_0 = fashion_mnist_train_transformed_data_0.reshape([28, 28])
plt.imshow(fashion_mnist_train_transformed_data_0, cmap=plt.cm.binary)
<matplotlib.image.AxesImage at 0x7fb130367b50>
# Note: train_dataset here is the 4-sample MyDataset defined above, so the
# single batch that comes back holds 4 samples even with batch_size=64
train_loader = paddle.io.DataLoader(train_dataset, batch_size=64, shuffle=True)
for batch_id, data in enumerate(train_loader()):
    x_data = data[0]
    y_data = data[1]
    print(x_data.numpy().shape)
    print(y_data.numpy().shape)
    break
(4, 32, 32, 3)
(4,)
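# DataLoader can also fetch samples in background processes via num_workers;
# a minimal sketch (num_workers=2 is an arbitrary choice for illustration):
train_loader_mp = paddle.io.DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)
for data in train_loader_mp():
    print(data[0].numpy().shape)
    break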
from paddle.io import SequenceSampler, RandomSampler, BatchSampler, DistributedBatchSampler

class RandomDataset(paddle.io.Dataset):
    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        image = np.random.random([784]).astype('float32')
        label = np.random.randint(0, 9, (1, )).astype('int64')
        return image, label

    def __len__(self):
        return self.num_samples

train_dataset = RandomDataset(100)

print('--Sequential sampling--')
sampler = SequenceSampler(train_dataset)
batch_sampler = BatchSampler(sampler=sampler, batch_size=10)
for index in batch_sampler:
    print(index)

print('--Random sampling--')
sampler = RandomSampler(train_dataset)
batch_sampler = BatchSampler(sampler=sampler, batch_size=10)
for index in batch_sampler:
    print(index)

print('--Distributed batch sampling--')
# With num_replicas=2, each replica sees every other batch of indices
batch_sampler = DistributedBatchSampler(train_dataset, num_replicas=2, batch_size=10)
for index in batch_sampler:
    print(index)
--Sequential sampling--
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
[50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
[70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
[80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
[90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
--Random sampling--
[9, 7, 54, 93, 84, 14, 12, 46, 67, 72]
[10, 57, 32, 61, 38, 71, 63, 51, 37, 11]
[21, 76, 69, 22, 48, 88, 19, 59, 47, 60]
[89, 85, 31, 80, 91, 30, 50, 52, 39, 3]
[70, 45, 62, 75, 35, 8, 96, 94, 5, 98]
[49, 33, 28, 13, 18, 42, 90, 0, 36, 79]
[81, 15, 6, 78, 40, 86, 2, 23, 95, 43]
[87, 65, 68, 25, 99, 26, 73, 82, 1, 53]
[77, 29, 17, 44, 55, 4, 56, 64, 97, 83]
[66, 41, 16, 74, 92, 34, 27, 24, 58, 20]
--Distributed batch sampling--
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
[80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
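# A sampler only produces indices. To drive training with one, hand the
# BatchSampler to DataLoader through the batch_sampler argument (leaving
# batch_size/shuffle/drop_last unset). A minimal sketch:
sampler = RandomSampler(train_dataset)
batch_sampler = BatchSampler(sampler=sampler, batch_size=10)
loader = paddle.io.DataLoader(train_dataset, batch_sampler=batch_sampler)
for image, label in loader:
    print(image.shape, label.shape)  # [10, 784] and [10, 1]
    break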
PaddlePaddle (飛槳), built on Baidu's years of deep learning research and business applications, is China's first open-source, technologically advanced, and fully featured industrial-grade deep learning platform, comprising the PaddlePaddle open-source platform and the PaddlePaddle enterprise edition. The open-source platform includes the core framework, basic model libraries, end-to-end development kits, and tool components; it continues to open-source core capabilities and provides a foundation for industrial, academic, and research innovation. The enterprise edition builds on the open-source platform with features strengthened for enterprise needs, and includes EasyDL, a zero-barrier AI development platform, and BML, a full-featured AI development platform. EasyDL mainly targets small and medium-sized enterprises, offering a zero-barrier, convenient, and efficient development platform preloaded with a rich set of networks and models; BML is a comprehensive development platform for large enterprises that can be flexibly customized and deeply integrated.