Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
197 views
in Technique[技术] by (71.8m points)

python - Tensorflow-tfrecords: Error while passing loaded tfrecords to model.fit()

I'm trying to load tfrecords to boost training speed. My tfrecord contains extracted features of audio files and it's corresponding label. To reproduce the same, I have generated a random NumPy array of size (50,50) and label(0 or 1) and saved as tfrecord and read them for training.

tensorflow version: 2.3.0

here is my sample code,

Generate TFrecord

from pathlib import Path
import tensorflow as tf
import numpy as np
import os
import shutil
from tensorflow.keras.optimizers import SGD, Adadelta,Adam, Nadam, RMSprop 
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.layers import Conv1D,Conv2D, Dense, Flatten, MaxPool1D,MaxPool2D, Dropout, BatchNormalization, Input, MaxPooling1D, Activation, Concatenate, SeparableConv1D
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, Bidirectional, GRU, LSTM, TimeDistributed, ConvLSTM2D, SimpleRNN, AveragePooling1D
from tensorflow.keras.layers import Reshape, Lambda, Dot, Softmax, LocallyConnected1D, LayerNormalization, add
from tensorflow.keras.models import Model, Sequential, load_model

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    # If the value is an eager tensor BytesList won't unpack a string from an EagerTensor.
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() 
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def create_example(label, audio_feature):
    feature = {
      'label': _int64_feature(label),
      'audio_feature': _bytes_feature(audio_feature),
    }
    # Create a Features message using tf.train.Example.
    example_proto =  tf.train.Example(features=tf.train.Features(feature=feature))

    return example_proto

def serialize_example(audio_feature, label):
    feature = {
      'audio_feature': _bytes_feature(audio_feature),
        'label': _int64_feature(label),
    }
    # Create a Features message using tf.train.Example.
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

        
def genrate_tf_records(tf_records_dir, length):
    cnt = 1
    if os.path.exists(tf_records_dir):
            shutil.rmtree(tf_records_dir)
            print(f"Removed old directory...({tf_records_dir})")
    print("Creating new tf_record directory...")
    Path(tf_records_dir).mkdir(parents = True, exist_ok = True)

    file_path = os.path.join(tf_records_dir, "tfdata.tfrecord")
    with tf.io.TFRecordWriter(file_path) as writer:
        for fn in range(length):
            audio_feature, label = np.random.randn(50,50), np.random.choice([0,1])
            serialized_example = serialize_example(tf.io.serialize_tensor(audio_feature), label)    
            writer.write(serialized_example)

                
 tf_records_train_dir ="./tf_features_aug_train/"
 tf_records_val_dir ="./tf_features_aug_val/"
 genrate_tf_records(tf_records_train_dir, length=1000)
 genrate_tf_records(tf_records_val_dir, length=100)

Read TFrecord

AUTOTUNE = tf.data.experimental.AUTOTUNE

def _parse_batch(record_batch):
    feature_description = {
        'audio_feature': tf.io.FixedLenFeature((), tf.string),
        'label': tf.io.FixedLenFeature((), tf.int64),
    }
    example = tf.io.parse_example(record_batch, feature_description)

    audio_feature = tf.io.parse_tensor(example['audio_feature'], out_type = tf.float64)
    #audio_feature = tf.reshape(audio_feature, (input_shape[0],input_shape[1]))
    label = example['label']
    return audio_feature, label

def get_dataset_from_tfrecords(tfrecords_dir='tfrecords', mode='train', n_epochs=1):
   
    # List all *.tfrecord files for the selected split
    files_ds = tf.data.Dataset.list_files(str(tfrecords_dir)+"*.tfrecord")

    # Disregard data order in favor of reading speed
    ignore_order = tf.data.Options()
    ignore_order.experimental_deterministic = False
    files_ds = files_ds.with_options(ignore_order)


    ds = tf.data.TFRecordDataset(files_ds, num_parallel_reads=AUTOTUNE)

    # Parse a batch into a dataset of [audio, label] pairs
    ds = ds.map(lambda x: _parse_batch(x))
    
    for data, label in ds.take(1):
        print(np.shape(data.numpy()))
        print(label.numpy())

    return ds.prefetch(buffer_size=AUTOTUNE)


train_ds = get_dataset_from_tfrecords(tfrecords_dir=tf_records_train_dir, mode='train')
val_ds   = get_dataset_from_tfrecords(tfrecords_dir=tf_records_val_dir, mode='val')

I'm able to load the saved tfrecords and can parse the data. But when I pass the TensorFlow object to model.fit() we get an error.

Create model

class Model_Creator():
    def getmodel(self, model_name, input_shape, numclass):
        #import pdb;pdb.set_trace()
        if model_name in dir(self) and callable(getattr(self, model_name)):
            print(model_name, 'from ACK.py')
            model = getattr(self, model_name)(input_shape, numclass) 
        else:
            print(model_name, 'from ign_utils/models_audio_ign.py')
            model =  Model_Creator2().getmodel( model_name, input_shape, numclass)  
        print('Created ',model_name)
        return model 
    
    def cnn_model(self, input_shape, numclass):
        datainp = Input(shape=input_shape)
        x=datainp
        #x = Dropout(0.1)(x)
        
        x = Conv1D(filters=32, kernel_size=3, strides=1, activation='relu', padding='same')(x)
        x = LayerNormalization(axis=2)(x)
        x = MaxPool1D(strides=2)(x)
        #x = Dropout(0.1)(x)
        
        x = GlobalAveragePooling1D()(x)
        x = Dropout(0.1)(x)
        x = Dense(numclass, activation='softmax')(x)
        output_model = Model(inputs=datainp, outputs=x)
        return output_model
    
   
modelcreator = Model_Creator()
model = modelcreator.getmodel(model_name = 'cnn_model', input_shape=(50,50), numclass=2)
model.summary()
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Run model

model.fit(train_ds, validation_data = val_ds,epochs=10)

When I run this, I get the error,

ValueError: slice index 0 of dimension 0 out of bounds. for '{{node strided_slice}} = StridedSlice[Index=DT_INT32, T=DT_INT32, begin_mask=0, ellipsis_mask=0, end_mask=0, new_axis_mask=0, shrink_axis_mask=1](Shape, strided_slice/stack, strided_slice/stack_1, strided_slice/stack_2)' with input shapes: [0], [1], [1], [1] and with computed input tensors: input[1] = <0>, input[2] = <1>, input[3] = <1>.

I don't understand why is this error. I can read and parse the data from tfrecords, but can't able to use it for training(model.fit()).

P.S: I have written the code to be reproduced easily. I'm sticking with it for the past two days. Looking forward for some help. Thanks in advance.


与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Reply

0 votes
by (71.8m points)
等待大神答复

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
OGeek|极客中国-欢迎来到极客的世界,一个免费开放的程序员编程交流平台!开放,进步,分享!让技术改变生活,让极客改变未来! Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...