我刚刚完成Coursera的DL专业化,我正在尝试用TensorFlow 2.0和我自己收集的数据实现一个CNN。我遵循tensorflow.org的指南和文档,能够设置一个管道来加载我的图像。然而,当我运行该模型时,我不断遇到与内存/资源相关的问题
我的模型应该使用30个类别进行多标签分类。下面是我的代码:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
#Import helper modules
import os
import numpy as np
import matplotlib.pyplot as plt
import pathlib
import os
import pandas as pd
import IPython.display as display
from PIL import Image
AUTOTUNE = tf.data.experimental.AUTOTUNE
#Define path to directories
train_dir = pathlib.Path.cwd() / 'train'
validation_dir = pathlib.Path.cwd() / 'validation'
test_dir = pathlib.Path.cwd() / 'test'
#Read csv file containing filename and label
train_csv = pd.read_csv(pathlib.Path.cwd() / 'train.csv')
validation_csv = pd.read_csv(pathlib.Path.cwd() / 'validation.csv')
#Define total number of training and validation set
total_train = train_csv.shape[0]
total_val = validation_csv.shape[0]
print(f'Total training images: {total_train}')
print(f'Total validation images: {total_val}')
下面的代码来自https://www.tensorflow.org/tutorials/load_data/images
CLASS_NAMES = np.array([item.name for item in train_dir.glob('*')])
#set up variables
BATCH_SIZE = 128
TRAIN_STEPS_PER_EPOCH = np.ceil(total_train/BATCH_SIZE)
VAL_STEPS_PER_EPOCH = np.ceil(total_val/BATCH_SIZE)
IMG_HEIGHT = 150
IMG_WIDTH = 150
#using td.data.Dataset
train_list_ds = tf.data.Dataset.list_files(str(train_dir/'*/*'))
valid_list_ds = tf.data.Dataset.list_files(str(validation_dir/'*/*'))
def get_label(file_path):
# convert the path to a list of path components
parts = tf.strings.split(file_path, os.path.sep)
# The second to last is the class-directory
return parts[-2] == CLASS_NAMES
def decode_img(img):
# convert the compressed string to a 3D uint8 tensor
img = tf.image.decode_jpeg(img, channels=3)
# Use `convert_image_dtype` to convert to floats in the [0,1] range.
img = tf.image.convert_image_dtype(img, tf.float32)
# resize the image to the desired size.
return tf.image.resize(img, [IMG_HEIGHT, IMG_WIDTH])
def process_path(file_path):
label = get_label(file_path)
# load the raw data from the file as a string
img = tf.io.read_file(file_path)
img = decode_img(img)
return img, label
def show_batch(image_batch, label_batch):
plt.figure(figsize=(10,10))
for n in range(25):
ax = plt.subplot(5,5,n+1)
plt.imshow(image_batch[n])
plt.title(CLASS_NAMES[label_batch[n]==1][0].title())
plt.axis('off')
# Set `num_parallel_calls` so multiple images are loaded/processed in parallel.
train_labeled_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
valid_labeled_ds = valid_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
# use `.cache(filename)` to cache preprocessing work for datasets that don't
# fit in memory.
if cache:
if isinstance(cache, str):
ds = ds.cache(cache)
else:
ds = ds.cache()
ds = ds.shuffle(buffer_size=shuffle_buffer_size)
# Repeat forever
ds = ds.repeat()
ds = ds.batch(BATCH_SIZE)
# `prefetch` lets the dataset fetch batches in the background while the model
# is training.
ds = ds.prefetch(buffer_size=AUTOTUNE)
return ds
train_ds = prepare_for_training(train_labeled_ds)
x_train, y_train = next(iter(train_ds))
valid_ds = prepare_for_training(valid_labeled_ds)
该模型来自教程:https://www.tensorflow.org/tutorials/images/classification
model = Sequential([
Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),
MaxPooling2D(),
Conv2D(32, 3, padding='same', activation='relu'),
MaxPooling2D(),
Conv2D(64, 3, padding='same', activation='relu'),
MaxPooling2D(),
Flatten(),
Dense(512, activation='relu'),
Dense(30, activation='softmax')
])
model.compile(optimizer='adam',
loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
metrics=['accuracy'])
model.summary()
model.fit(train_ds.repeat(),
epochs=15, batch_size= BATCH_SIZE,
validation_data=valid_ds.repeat(), steps_per_epoch= TRAIN_STEPS_PER_EPOCH,
validation_steps=VAL_STEPS_PER_EPOCH)
起初我遇到了输入数据用完了,所以我将输入数据集改为train\u ds.repeat(),而不是x=x\u train,y=y\u train
我遇到的下一个问题是
100/741 [===>..........................] - ETA: 18:07 - loss: 3.7188 - accuracy: 0.04942020-06-23 14:52:29.232604: E tensorflow/core/lib/jpeg/jpeg_mem.cc:323] Premature end of JPEG data. Stopped at line 910/1000
Traceback (most recent call last):
File "product_detection.py", line 123, in <module>
model.fit(train_ds.repeat(), epochs=15, batch_size= BATCH_SIZE, validation_data=valid_ds.repeat(), steps_per_epoch= TRAIN_STEPS_PER_EPOCH, validation_steps=VAL_STEPS_PER_EPOCH)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
tmp_logs = train_function(iterator)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
result = self._call(*args, **kwds)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 2420, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 1665, in _filtered_call
self.captured_inputs)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 1746, in _call_flat
ctx, args, cancellation_manager=cancellation_manager))
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 598, in call
ctx=ctx)
File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\execute.py", line 60, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: Invalid JPEG data or crop window, data size 101360
[[{{node DecodeJpeg}}]]
[[IteratorGetNext]]
(1) Invalid argument: Invalid JPEG data or crop window, data size 101360
[[{{node DecodeJpeg}}]]
[[IteratorGetNext]]
[[IteratorGetNext/_2]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_1113]
Function call stack:
train_function -> train_function
2020-06-23 14:52:29.411288: W tensorflow/core/kernels/data/cache_dataset_ops.cc:794] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
在这一点上,我完全不知所措,我怀疑我复制的prepare\u for\u training功能不适合我的应用程序,但我没有足够的理解来进行更改。它明确地说,它适用于1000多个图像的数据集,而我使用的是90k训练和10k验证数据集。我尝试更改批次大小,但问题仍然存在
我正在使用带有GTX 1050 Ti的TensorFlow gpu。我可以请你指点一下如何进行这项工作吗?先谢谢你
编辑1:将我的批次大小更改为10,出现此错误
3537/9484 [==========>...................] - ETA: 12:31 - loss: 3.7519 - accuracy: 0.02602020-06-23 16:10:13.148245: W tensorflow/core/framework/op_kernel.cc:1753] OP_REQUIRES failed at cast_op.cc:109 : Resource exhausted: OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
2020-06-23 16:10:13.159005: W tensorflow/core/framework/op_kernel.cc:1753] OP_REQUIRES failed at cast_op.cc:109 : Resource exhausted: OOM when allocating tensor with shape[678,678,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
Resou3538/9484 [==========>...................] - ETA: 12:31 - loss: 3.7519 - accuracy: 0.0260rce exhausted: OOM when allocating tensor with shape[956,956,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
tensorflow.python.framework.errors_impl.ResourceExhaustedError: 2 root error(s) found.
(0) Resource exhausted: OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
[[{{node convert_image/Cast}}]]
[[IteratorGetNext]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[IteratorGetNext/_2]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
(1) Resource exhausted: OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
[[{{node convert_image/Cast}}]]
[[IteratorGetNext]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
根据本节的输出:
似乎如果某些数据已损坏,请检查数据并删除损坏的条目,这是一个常见错误,您可以检查HERE和THERE
相关问题 更多 >
编程相关推荐