其他分享
首页 > 其他分享> > 排序模型进阶-Wide&Deep

排序模型进阶-Wide&Deep

作者:互联网

日萌社

人工智能AI:Keras PyTorch MXNet TensorFlow PaddlePaddle 深度学习实战(不定时更新)


8.5 排序模型进阶-Wide&Deep

学习目标

8.5.1 wide&deep

tf.estimator传入参数原则

代码:

import tensorflow as tf

class WDL(object):
    """Wide & Deep ranking model built on ``tf.estimator`` (TF 1.x API).

    ``read_ctr_records`` serves as the estimator ``input_fn`` and
    ``build_estimator`` creates the ``DNNLinearCombinedClassifier`` that
    consumes the features it yields.
    """

    def __init__(self):
        pass

    @staticmethod
    def read_ctr_records():
        """Build the input dataset from the CTR TFRecord file.

        Each record holds an int64 ``label`` and a ``feature`` byte string
        that is decoded as float64 and reshaped to ``[1, 121]``, laid out as::

            [0]        channel_id       (categorical)
            [1:101]    article vector   (100 values)
            [101:111]  user weights     (10 values)
            [111:121]  article weights  (10 values)

        Each of the three dense groups is averaged into a single continuous
        feature.

        :return: a ``tf.data.Dataset`` of ``(feature_dict, label)`` batches of
            64 records. The dataset repeats without a count, so callers have
            to bound training / evaluation themselves (e.g. with ``steps``).
        """
        # Parsing function applied to every serialized example.
        def parse_tfrecords_function(example_proto):
            features = {
                "label": tf.FixedLenFeature([], tf.int64),
                "feature": tf.FixedLenFeature([], tf.string)
            }
            parsed_features = tf.parse_single_example(example_proto, features)

            feature = tf.decode_raw(parsed_features['feature'], tf.float64)
            feature = tf.reshape(tf.cast(feature, tf.float32), [1, 121])
            # Feature order: 1 channel_id, 100 article_vector,
            # 10 user_weights, 10 article_weights.
            # channel_id is categorical; every dense group is reduced to its
            # mean (not its sum) so that groups of different width end up on
            # a comparable scale.
            channel_id = tf.cast(tf.slice(feature, [0, 0], [1, 1]), tf.int32)
            vector = tf.reduce_mean(tf.slice(feature, [0, 1], [1, 100]), axis=1)
            user_weights = tf.reduce_mean(tf.slice(feature, [0, 101], [1, 10]), axis=1)
            article_weights = tf.reduce_mean(tf.slice(feature, [0, 111], [1, 10]), axis=1)

            label = tf.cast(parsed_features['label'], tf.float32)

            # Map feature name -> tensor.
            # NOTE: the key 'user_weigths' (sic) must stay identical to the
            # numeric_column name used in build_estimator().
            FEATURE_COLUMNS = ['channel_id', 'vector', 'user_weigths', 'article_weights']
            tensor_list = [channel_id, vector, user_weights, article_weights]

            feature_dict = dict(zip(FEATURE_COLUMNS, tensor_list))

            return feature_dict, label

        dataset = tf.data.TFRecordDataset(["./train_ctr_201905.tfrecords"])
        dataset = dataset.map(parse_tfrecords_function)
        dataset = dataset.batch(64)
        # repeat() without a count never signals end-of-input.
        dataset = dataset.repeat()
        return dataset

    def build_estimator(self):
        """Create the Wide & Deep classifier.

        The wide (linear) side sees the categorical ``channel_id`` column;
        the deep (DNN) side sees a 25-dimensional embedding of that column
        plus the three continuous columns.

        :return: a ``tf.estimator.DNNLinearCombinedClassifier`` that stores
            its checkpoints under ``./ckpt/wide_and_deep``.
        """
        # Categorical column.
        channel_id = tf.feature_column.categorical_column_with_identity('channel_id', num_buckets=25)
        # Continuous columns; names must match the keys produced by
        # read_ctr_records(), including the 'user_weigths' spelling.
        vector = tf.feature_column.numeric_column('vector')
        user_weights = tf.feature_column.numeric_column('user_weigths')
        article_weights = tf.feature_column.numeric_column('article_weights')

        wide_columns = [channel_id]

        # embedding_column turns the categorical column into a dense input.
        deep_columns = [tf.feature_column.embedding_column(channel_id, dimension=25),
                        vector, user_weights, article_weights]

        estimator = tf.estimator.DNNLinearCombinedClassifier(model_dir="./ckpt/wide_and_deep",
                                                             linear_feature_columns=wide_columns,
                                                             dnn_feature_columns=deep_columns,
                                                             dnn_hidden_units=[1024, 512, 256])

        return estimator


if __name__ == '__main__':
    # read_ctr_records() repeats its dataset without a count, so both calls
    # need an explicit step budget: without one train() never returns and
    # evaluate() is never reached.
    TRAIN_STEPS = 1000
    EVAL_STEPS = 100

    wdl = WDL()
    estimator = wdl.build_estimator()
    estimator.train(input_fn=wdl.read_ctr_records, steps=TRAIN_STEPS)
    eval_result = estimator.evaluate(input_fn=wdl.read_ctr_records, steps=EVAL_STEPS)
    print(eval_result)

8.5.2 三个版本特征数据处理效果对比

| 特征不同的处理效果 | baseline | 1离散特征、文章向量平均值、用户权重平均值、文章权重平均值 | 1离散特征、1个111连续特征 | 1离散特征、100个连续文章向量、10文章权重、10用户权重 |
| --- | --- | --- | --- | --- |
| accuracy | 0.9051438053097345 | 0.9046435 | 0.9046435 | 0.9046435 |
| auc | 0.719274521004087 | 0.57850575 | 0.5896939 | 0.62383443 |

效果对比总结:

三个版本特征处理数据函数以及构建模型

@staticmethod
    def read_ctr_records_v1():
        # 定义转换函数,输入时序列化的
        def parse_tfrecords_function(example_proto):
            features = {
                "label": tf.FixedLenFeature([], tf.int64),
                "feature": tf.FixedLenFeature([], tf.string)
            }
            parsed_features = tf.parse_single_example(example_proto, features)

            feature = tf.decode_raw(parsed_features['feature'], tf.float64)
            feature = tf.reshape(tf.cast(feature, tf.float32), [1, 121])
            # 特征顺序 1 channel_id,  100 article_vector, 10 user_weights, 10 article_weights
            # 1 channel_id类别型特征, 100维文章向量求平均值当连续特征,10维用户权重求平均值当连续特征
            channel_id = tf.cast(tf.slice(feature, [0, 0], [1, 1]), tf.int32)
            vector = tf.reduce_mean(tf.slice(feature, [0, 1], [1, 100]), axis=1)
            user_weights = tf.reduce_mean(tf.slice(feature, [0, 101], [1, 10]), axis=1)
            article_weights = tf.reduce_mean(tf.slice(feature, [0, 111], [1, 10]), axis=1)

            label = tf.cast(parsed_features['label'], tf.float32)

            # 构造字典 名称-tensor
            FEATURE_COLUMNS = ['channel_id', 'vector', 'user_weights', 'article_weights']
            tensor_list = [channel_id, vector, user_weights, article_weights]

            feature_dict = dict(zip(FEATURE_COLUMNS, tensor_list))

            return feature_dict, label

        dataset = tf.data.TFRecordDataset(["./ctr_train_20190706.tfrecords"])
        dataset = dataset.map(parse_tfrecords_function)
        dataset = dataset.batch(64)
        dataset = dataset.repeat(100)
        return dataset



    def build_estimator(self):
        """Assemble the feature columns and the Wide & Deep classifier.

        :return: ``tf.estimator.DNNLinearCombinedClassifier`` with hidden
            layers of 256, 128 and 64 units, checkpointing to
            ``./tmp/ckpt/wide_and_deep``.
        """
        # Categorical input, shared by the wide side and (embedded) the deep side.
        channel_column = tf.feature_column.categorical_column_with_identity(
            'channel_id', num_buckets=25)

        # One scalar numeric column per continuous feature.
        dense_columns = [
            tf.feature_column.numeric_column(key)
            for key in ('vector', 'user_weights', 'article_weights')
        ]

        linear_side = [channel_column]
        dnn_side = [tf.feature_column.embedding_column(channel_column, dimension=25)]
        dnn_side.extend(dense_columns)

        return tf.estimator.DNNLinearCombinedClassifier(
            model_dir="./tmp/ckpt/wide_and_deep",
            linear_feature_columns=linear_side,
            dnn_feature_columns=dnn_side,
            dnn_hidden_units=[256, 128, 64])
@staticmethod
    def read_ctr_records_v2():
        # 定义转换函数,输入时序列化的
        def parse_tfrecords_function(example_proto):
            features = {
                "label": tf.FixedLenFeature([], tf.int64),
                "feature": tf.FixedLenFeature([], tf.string)
            }
            parsed_features = tf.parse_single_example(example_proto, features)

            feature = tf.decode_raw(parsed_features['feature'], tf.float64)
            feature = tf.reshape(tf.cast(feature, tf.float32), [1, 121])
            channel_id = tf.cast(tf.slice(feature, [0, 0], [1, 1]), tf.int32)

            label = tf.cast(parsed_features['label'], tf.float32)

            # 构造字典 名称-tensor
            FEATURE_COLUMNS = ['channel_id', 'feature']

            tensor_list = [channel_id, feature]

            feature_dict = dict(zip(FEATURE_COLUMNS, tensor_list))

            return feature_dict, label

        dataset = tf.data.TFRecordDataset(["./ctr_train_20190706.tfrecords"])
        dataset = dataset.map(parse_tfrecords_function)
        dataset = dataset.batch(64)
        dataset = dataset.repeat(100)
        return dataset

def build_estimator_v2(self):
        """Assemble the feature columns and the classifier for version 2.

        The deep side receives the ``channel_id`` embedding plus a single
        numeric column ``feature`` of shape ``[1, 121]``.

        :return: ``tf.estimator.DNNLinearCombinedClassifier`` with hidden
            layers of 256, 128 and 64 units, checkpointing to
            ``./tmp/ckpt/wide_and_deep_v2``.
        """
        # Categorical input, shared by the wide side and (embedded) the deep side.
        channel_column = tf.feature_column.categorical_column_with_identity(
            'channel_id', num_buckets=25)

        # The whole decoded row as one multi-dimensional numeric column.
        row_column = tf.feature_column.numeric_column('feature', shape=[1, 121])

        linear_side = [channel_column]
        dnn_side = [
            tf.feature_column.embedding_column(channel_column, dimension=25),
            row_column,
        ]

        return tf.estimator.DNNLinearCombinedClassifier(
            model_dir="./tmp/ckpt/wide_and_deep_v2",
            linear_feature_columns=linear_side,
            dnn_feature_columns=dnn_side,
            dnn_hidden_units=[256, 128, 64])
@staticmethod
    def read_ctr_records_v3():
        # 定义转换函数,输入时序列化的
        def parse_tfrecords_function(example_proto):
            features = {
                "label": tf.FixedLenFeature([], tf.int64),
                "feature": tf.FixedLenFeature([], tf.string)
            }
            parsed_features = tf.parse_single_example(example_proto, features)

            feature = tf.decode_raw(parsed_features['feature'], tf.float64)
            feature = tf.reshape(tf.cast(feature, tf.float32), [1, 121])
            channel_id = tf.cast(tf.slice(feature, [0, 0], [1, 1]), tf.int32)
            vector = tf.slice(feature, [0, 1], [1, 100])
            user_weights = tf.slice(feature, [0, 101], [1, 10])
            article_weights = tf.slice(feature, [0, 111], [1, 10])

            label = tf.cast(parsed_features['label'], tf.float32)

            # 构造字典 名称-tensor
            FEATURE_COLUMNS = ['channel_id', 'vector', 'user_weights', 'article_weights']

            tensor_list = [channel_id, vector, user_weights, article_weights]

            feature_dict = dict(zip(FEATURE_COLUMNS, tensor_list))

            return feature_dict, label

        dataset = tf.data.TFRecordDataset(["./ctr_train_20190706.tfrecords"])
        dataset = dataset.map(parse_tfrecords_function)
        dataset = dataset.batch(64)
        dataset = dataset.repeat(100)
        return dataset


def build_estimator_v3(self):
        """Assemble the feature columns and the classifier for version 3.

        The deep side receives the ``channel_id`` embedding plus three
        multi-dimensional numeric columns: ``vector`` (``[1, 100]``),
        ``user_weights`` (``[1, 10]``) and ``article_weights`` (``[1, 10]``).

        :return: ``tf.estimator.DNNLinearCombinedClassifier`` with hidden
            layers of 256, 128 and 64 units, checkpointing to
            ``./tmp/ckpt/wide_and_deep_v3``.
        """
        # Categorical input, shared by the wide side and (embedded) the deep side.
        channel_column = tf.feature_column.categorical_column_with_identity(
            'channel_id', num_buckets=25)

        # (feature key, number of values) for every dense group.
        dense_specs = (('vector', 100), ('user_weights', 10), ('article_weights', 10))
        dense_columns = [
            tf.feature_column.numeric_column(key, shape=[1, width])
            for key, width in dense_specs
        ]

        linear_side = [channel_column]
        dnn_side = [tf.feature_column.embedding_column(channel_column, dimension=25)]
        dnn_side.extend(dense_columns)

        return tf.estimator.DNNLinearCombinedClassifier(
            model_dir="./tmp/ckpt/wide_and_deep_v3",
            linear_feature_columns=linear_side,
            dnn_feature_columns=dnn_side,
            dnn_hidden_units=[256, 128, 64])

标签:Wide,进阶,column,feature,Deep,dataset,weights,tf,id
来源: https://blog.csdn.net/zimiao552147572/article/details/106859072