猫眼电影网站电影_时长_数据分析与可视化
作者:互联网
import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; public class Movie11Mapper extends Mapper<LongWritable,Text,Text,DoubleWritable> { @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { //super.map(key,value,context); String line = value.toString(); //[0 芳华 9.1 http://maoyan.com/films/1170264 剧情,爱情,战争 中国大陆 大陆上映 136 2017] String[] arr = line.split("\00"); String type = null; double time = 0; if (arr.length > 7) { time = Double.parseDouble(arr[7]); if (arr.length > 4) { type = arr[4]; if (type == null || "".equals(type)) { return; } else if (type.contains(",")) { type = type.split(",")[0]; } context.write(new Text(type), new DoubleWritable(time)); } } } }
import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import java.io.IOException; import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import java.io.IOException; public class Movie11Reducer extends Reducer<Text, DoubleWritable,Text,DoubleWritable> { @Override protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException { //super.reduce(key, values, context); double sum=0; double sc=0; double avg_time; for (DoubleWritable i:values){ sum+=i.get(); sc++; } String str=String.format("%.1f",sum/sc); //保留两位小数 avg_time=Double.parseDouble(str); context.write(key,new DoubleWritable(avg_time)); } }
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.io.IOException; public class Movie11Runner { public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf=new Configuration(); //创建job Job job= Job.getInstance(conf,"maoyan"); //设置输入输出路径 FileInputFormat.addInputPath(job,new Path(args[0])); FileOutputFormat.setOutputPath(job,new Path(args[1])); //设置运行类 job.setJarByClass(Movie11Runner.class); job.setMapperClass(Movie11Mapper.class); job.setReducerClass(Movie11Reducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); System.exit(job.waitForCompletion(true)?0:1); } }
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Select the SimHei font so the Chinese labels below are rendered.
matplotlib.rcParams['font.family'] = 'SimHei'
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# Load the tab-separated, header-less two-column result file.
data = pd.read_csv(r"E:\output11\part-r-00000", sep='\t', header=None)
# The second column must carry the same name that is read back below;
# the previous name ('上映电影数量') made the lookup raise KeyError.
data.columns = ['类型', '上映电影平均时长']
data.head()  # preview only; produces no output when run as a plain script

plt.figure(figsize=(12, 6))
x = data['类型']
y = data['上映电影平均时长']
# Legend label names the plotted quantity (average duration) rather than '分数' (score).
plt.bar(x, y, color='g', width=0.5, label='平均时长')
plt.xlabel('电影类型')
plt.ylabel('平均时长')
plt.xticks(rotation=30)
plt.title('上映电影类型的平均时长图表')
plt.legend(fontsize=12)
plt.show()
标签:电影,hadoop,job,可视化,io,org,apache,import,时长 来源: https://www.cnblogs.com/modikasi/p/16642090.html