TensorFlow 2.0 Beginer实现简单CNN

2024-07-02 04:50:19

我刚刚完成Coursera的DL专业化,我正在尝试用TensorFlow 2.0和我自己收集的数据实现一个CNN。我遵循tensorflow.org的指南和文档,能够设置一个管道来加载我的图像。然而,当我运行该模型时,我不断遇到与内存/资源相关的问题


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D

#Import helper modules
import os
import numpy as np
import matplotlib.pyplot as plt
import pathlib
import os
import pandas as pd
import IPython.display as display
from PIL import Image

AUTOTUNE = tf.data.experimental.AUTOTUNE

#Define path to directories
train_dir = pathlib.Path.cwd() / 'train'
validation_dir = pathlib.Path.cwd() / 'validation'
test_dir = pathlib.Path.cwd() / 'test'

#Read csv file containing filename and label
train_csv = pd.read_csv(pathlib.Path.cwd() / 'train.csv')
validation_csv = pd.read_csv(pathlib.Path.cwd() / 'validation.csv')

#Define total number of training and validation set
total_train = train_csv.shape[0]
total_val = validation_csv.shape[0]
print(f'Total training images: {total_train}')
print(f'Total validation images: {total_val}')


CLASS_NAMES = np.array([item.name for item in train_dir.glob('*')])

#set up variables
TRAIN_STEPS_PER_EPOCH = np.ceil(total_train/BATCH_SIZE)
VAL_STEPS_PER_EPOCH = np.ceil(total_val/BATCH_SIZE)

#using td.data.Dataset
train_list_ds = tf.data.Dataset.list_files(str(train_dir/'*/*'))
valid_list_ds = tf.data.Dataset.list_files(str(validation_dir/'*/*'))

def get_label(file_path):
  # convert the path to a list of path components
  parts = tf.strings.split(file_path, os.path.sep)
  # The second to last is the class-directory
  return parts[-2] == CLASS_NAMES

def decode_img(img):
  # convert the compressed string to a 3D uint8 tensor
  img = tf.image.decode_jpeg(img, channels=3)
  # Use `convert_image_dtype` to convert to floats in the [0,1] range.
  img = tf.image.convert_image_dtype(img, tf.float32)
  # resize the image to the desired size.
  return tf.image.resize(img, [IMG_HEIGHT, IMG_WIDTH])

def process_path(file_path):
  label = get_label(file_path)
  # load the raw data from the file as a string
  img = tf.io.read_file(file_path)
  img = decode_img(img)
  return img, label

def show_batch(image_batch, label_batch):
  for n in range(25):
      ax = plt.subplot(5,5,n+1)

# Set `num_parallel_calls` so multiple images are loaded/processed in parallel.
train_labeled_ds = train_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)
valid_labeled_ds = valid_list_ds.map(process_path, num_parallel_calls=AUTOTUNE)

def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
  # use `.cache(filename)` to cache preprocessing work for datasets that don't
  # fit in memory.
  if cache:
    if isinstance(cache, str):
      ds = ds.cache(cache)
      ds = ds.cache()

  ds = ds.shuffle(buffer_size=shuffle_buffer_size)

  # Repeat forever
  ds = ds.repeat()

  ds = ds.batch(BATCH_SIZE)

  # `prefetch` lets the dataset fetch batches in the background while the model
  # is training.
  ds = ds.prefetch(buffer_size=AUTOTUNE)

  return ds

train_ds = prepare_for_training(train_labeled_ds)
x_train, y_train = next(iter(train_ds))
valid_ds = prepare_for_training(valid_labeled_ds)


model = Sequential([
    Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),
    Conv2D(32, 3, padding='same', activation='relu'),
    Conv2D(64, 3, padding='same', activation='relu'),
    Dense(512, activation='relu'),
    Dense(30, activation='softmax')



          epochs=15, batch_size= BATCH_SIZE, 
          validation_data=valid_ds.repeat(), steps_per_epoch= TRAIN_STEPS_PER_EPOCH,        

起初我遇到了输入数据用完了,所以我将输入数据集改为train\u ds.repeat(),而不是x=x\u train,y=y\u train


100/741 [===>..........................] - ETA: 18:07 - loss: 3.7188 - accuracy: 0.04942020-06-23 14:52:29.232604: E tensorflow/core/lib/jpeg/jpeg_mem.cc:323] Premature end of JPEG data. Stopped at line 910/1000
Traceback (most recent call last):
  File "product_detection.py", line 123, in <module>
    model.fit(train_ds.repeat(), epochs=15, batch_size= BATCH_SIZE, validation_data=valid_ds.repeat(), steps_per_epoch= TRAIN_STEPS_PER_EPOCH, validation_steps=VAL_STEPS_PER_EPOCH)
  File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
    return method(self, *args, **kwargs)
  File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
    tmp_logs = train_function(iterator)
  File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\def_function.py", line 580, in __call__
    result = self._call(*args, **kwds)
  File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\def_function.py", line 611, in _call
    return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
  File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 2420, in __call__
    return graph_function._filtered_call(args, kwargs)  # pylint: disable=protected-access
  File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 1665, in _filtered_call
  File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 1746, in _call_flat
    ctx, args, cancellation_manager=cancellation_manager))
  File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\function.py", line 598, in call
  File "C:\Users\sl199\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\eager\execute.py", line 60, in quick_execute
    inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  Invalid JPEG data or crop window, data size 101360
         [[{{node DecodeJpeg}}]]
  (1) Invalid argument:  Invalid JPEG data or crop window, data size 101360
         [[{{node DecodeJpeg}}]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_1113]

Function call stack:
train_function -> train_function

2020-06-23 14:52:29.411288: W tensorflow/core/kernels/data/cache_dataset_ops.cc:794] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.

在这一点上,我完全不知所措,我怀疑我复制的prepare\u for\u training功能不适合我的应用程序,但我没有足够的理解来进行更改。它明确地说,它适用于1000多个图像的数据集,而我使用的是90k训练和10k验证数据集。我尝试更改批次大小,但问题仍然存在

我正在使用带有GTX 1050 Ti的TensorFlow gpu。我可以请你指点一下如何进行这项工作吗?先谢谢你


3537/9484 [==========>...................] - ETA: 12:31 - loss: 3.7519 - accuracy: 0.02602020-06-23 16:10:13.148245: W tensorflow/core/framework/op_kernel.cc:1753] OP_REQUIRES failed at cast_op.cc:109 : Resource exhausted: OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
2020-06-23 16:10:13.159005: W tensorflow/core/framework/op_kernel.cc:1753] OP_REQUIRES failed at cast_op.cc:109 : Resource exhausted: OOM when allocating tensor with shape[678,678,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
 Resou3538/9484 [==========>...................] - ETA: 12:31 - loss: 3.7519 - accuracy: 0.0260rce exhausted: OOM when allocating tensor with shape[956,956,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu

tensorflow.python.framework.errors_impl.ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
         [[{{node convert_image/Cast}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted:  OOM when allocating tensor with shape[943,943,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
         [[{{node convert_image/Cast}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

2024-07-02 04:50:19


tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  Invalid JPEG data or crop window, data size 101360
         [[{{node DecodeJpeg}}]]
  (1) Invalid argument:  Invalid JPEG data or crop window, data size 101360
         [[{{node DecodeJpeg}}]]


