爬取Macy网用户评价日志(1):需求存储数据库设计
作者:互联网
一、数据库设计如图:
mysql> show tables;
+-------------------------------------+
| Tables_in_macy |
+-------------------------------------+
| rank1_cate_urls |
| rank2_cate_urls |
| rank3_cate_urls |
| rank4_prod_specific_info |
| rank4_prod_specific_review_products |
| rank4_prod_specific_review_reviews |
+-------------------------------------+
二、一共设计了6个tables.然后,分别存放
1) 第一级url,
mysql> describe rank1_cate_urls;
+-------------------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+-------------------+--------------+------+-----+---------+-------+
| cate_name | varchar(100) | NO | | NULL | |
| cate_url | varchar(500) | NO | PRI | NULL | |
| request_situation | varchar(10) | YES | | NULL | |
+-------------------+--------------+------+-----+---------+-------+
2) 第二级url,
mysql> describe rank2_cate_urls;
+-------------------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+-------------------+--------------+------+-----+---------+-------+
| upper_cate_url | varchar(500) | NO | MUL | NULL | |
| upper_cate_name | varchar(100) | YES | | NULL | |
| cate_name | varchar(100) | NO | | NULL | |
| cate_url | varchar(500) | NO | PRI | NULL | |
| request_situation | varchar(10) | YES | | NULL | |
+-------------------+--------------+------+-----+---------+-------+
5 rows in set (0.00 sec)
3) 第三级url.
mysql> describe rank3_cate_urls;
+-------------------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+-------------------+--------------+------+-----+---------+-------+
| upper_cate_url | varchar(500) | NO | MUL | NULL | |
| upper_cls_name | varchar(100) | YES | | NULL | |
| cate_name | varchar(100) | NO | | NULL | |
| cate_url | varchar(500) | NO | PRI | NULL | |
| request_situation | varchar(10) | YES | | NULL | |
| now_price | float | YES | | NULL | |
| old_price | float | YES | | NULL | |
| web_id | varchar(10) | YES | | NULL | |
+-------------------+--------------+------+-----+---------+-------+
8 rows in set (0.00 sec)
4) 具体产品信息。
mysql> describe rank4_prod_specific_info
-> ;
+-----------------------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+-----------------------+--------------+------+-----+---------+-------+
| prod_id | varchar(10) | NO | PRI | NULL | |
| prod_name | varchar(100) | YES | | NULL | |
| prod_url | varchar(500) | YES | | NULL | |
| page_prod_name | varchar(200) | YES | | NULL | |
| prod_before_price | varchar(100) | YES | | NULL | |
| prod_discount_price | varchar(20) | YES | | NULL | |
| prod_now_price | varchar(100) | YES | | NULL | |
| request_situation | varchar(10) | YES | | NULL | |
| page_web_id | varchar(10) | YES | | NULL | |
| img_request_situation | varchar(10) | YES | | NULL | |
+-----------------------+--------------+------+-----+---------+-------+
10 rows in set (0.01 sec)
5) 根据review package提取的product信息,
mysql> describe rank4_prod_specific_review_products;
+------------------------------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+------------------------------+--------------+------+-----+---------+-------+
| description | varchar(500) | YES | | NULL | |
| imageUrl | varchar(500) | YES | | NULL | |
| name | varchar(500) | YES | | NULL | |
| productId | varchar(10) | NO | PRI | NULL | |
| productPageUrl | varchar(500) | YES | | NULL | |
| reviewStatistics | varchar(5) | YES | | NULL | |
| averageOverallRating | float | YES | | NULL | |
| averageSliders | varchar(45) | YES | | NULL | |
| notRecommendedCount | int | YES | | NULL | |
| overallRatingRange | int | YES | | NULL | |
| ratingDistribution | varchar(5) | YES | | NULL | |
| ratingPercentage | int | YES | | NULL | |
| ratingsOnlyReviewCount | int | YES | | NULL | |
| recommendedCount | int | YES | | NULL | |
| secondaryRatingsAverages | varchar(45) | YES | | NULL | |
| secondaryRatingAveragesOrder | varchar(45) | YES | | NULL | |
| totalReviewCount | int | YES | | NULL | |
| locale | varchar(45) | YES | | NULL | |
+------------------------------+--------------+------+-----+---------+-------+
18 rows in set (0.01 sec)
6) 根据review package提取的review信息。
mysql> describe rank4_prod_specific_review_reviews;
+----------------------------+--------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+----------------------------+--------------+------+-----+---------+-------+
| anonymous | varchar(5) | YES | | NULL | |
| authorId | varchar(50) | NO | | NULL | |
| badges | varchar(45) | YES | | NULL | |
| badgesOrder | varchar(45) | YES | | NULL | |
| clientResponses | varchar(45) | YES | | NULL | |
| contentLocale | varchar(50) | YES | | NULL | |
| contextDataValues | varchar(45) | YES | | NULL | |
| contextDataValuesOrder | varchar(45) | NO | | NULL | |
| displayName | varchar(50) | YES | | NULL | |
| featured | varchar(5) | YES | | NULL | |
| incentivizedReview | varchar(5) | YES | | NULL | |
| lastModificationTime | varchar(50) | NO | | NULL | |
| moderationStatus | varchar(50) | YES | | NULL | |
| photos | varchar(45) | YES | | NULL | |
| productId | varchar(10) | NO | MUL | NULL | |
| rating | float | YES | | NULL | |
| ratingPercentage | int | YES | | NULL | |
| ratingRange | int | YES | | NULL | |
| ratingsOnly | varchar(5) | YES | | NULL | |
| recommended | varchar(5) | YES | | NULL | |
| reviewId | varchar(50) | NO | PRI | NULL | |
| reviewText | varchar(500) | YES | | NULL | |
| reviewerId | varchar(50) | NO | | NULL | |
| secondaryRatings | varchar(45) | YES | | NULL | |
| submissionTime | varchar(50) | YES | | NULL | |
| syndicated | varchar(5) | YES | | NULL | |
| title | varchar(500) | YES | | NULL | |
| topContributor | varchar(5) | YES | | NULL | |
| totalFeedbackCount | int | YES | | NULL | |
| totalNegativeFeedbackCount | int | YES | | NULL | |
| totalPositiveFeedbackCount | int | YES | | NULL | |
| userNickname | varchar(50) | YES | | NULL | |
| videos | varchar(45) | YES | | NULL | |
+----------------------------+--------------+------+-----+---------+-------+
33 rows in set (0.27 sec)
+
标签:prod,varchar,cate,NO,Macy,爬取,日志,NULL,YES 来源: https://www.cnblogs.com/cainiaoxuexi2017-ZYA/p/15902981.html