首页 > 其他分享 > 使用Flume http进行数据采集

使用Flume http进行数据采集

2022-02-25 23:00:07 作者：互联网

1. 编写Flume http配置文件

[root@sam01 scripts]# vim collect-app-http.conf

# filename: collect-app-http.conf
# Flume agent "b1001": receives events over HTTP, buffers them in a memory
# channel, and writes them to HDFS as gzip-compressed text files.
# Declare the channel
b1001.channels = ch-1
# Declare the source
b1001.sources = src-1
# Declare the sink
b1001.sinks = k1

# Attach the sink to the channel
b1001.sinks.k1.channel = ch-1

# Attach the source to the channel
b1001.sources.src-1.channels = ch-1
b1001.sources.src-1.type = http
# Address the HTTP source binds to (0.0.0.0 = all interfaces)
b1001.sources.src-1.bind=0.0.0.0
# Port the HTTP source listens on
b1001.sources.src-1.port=9666

# HDFS sink: one directory per day (%Y%m%d), file names prefixed with the
# day and hour (news-%Y%m%d_%H) and suffixed with .gz.
b1001.sinks.k1.type = hdfs
b1001.sinks.k1.hdfs.path = hdfs://sam01:8020/sources/news-article/%Y%m%d
b1001.sinks.k1.hdfs.filePrefix = news-%Y%m%d_%H
b1001.sinks.k1.hdfs.fileSuffix = .gz
b1001.sinks.k1.hdfs.codeC = gzip
# Resolve the time escape sequences above from the agent's local time
# rather than from a timestamp header on the event.
b1001.sinks.k1.hdfs.useLocalTimeStamp = true
b1001.sinks.k1.hdfs.writeFormat = Text
b1001.sinks.k1.hdfs.fileType = CompressedStream
# Disable rolling files based on the number of events
b1001.sinks.k1.hdfs.rollCount = 0
# Roll the file once it reaches 10 MB (10485760 bytes)
b1001.sinks.k1.hdfs.rollSize = 10485760
# Roll the file every 600 seconds (10 minutes). This works together with the
# size-based roll: whichever threshold is reached first triggers the roll.
# NOTE(review): the original comment said "5 minutes", which does not match
# the value 600; use 300 if a 5-minute roll is what is actually wanted.
b1001.sinks.k1.hdfs.rollInterval = 600
# See the official Flume documentation for batchSize. In theory a larger
# batchSize gives higher throughput. However, the HDFS sink throws an
# exception when a Hadoop RPC (open, flush, close, ...) times out; if that
# happens during flush, some events may already have been written to HDFS,
# and after the transaction rolls back the whole batch is written again,
# producing duplicates. The larger the batchSize, the more data may be
# duplicated. batchSize must also not exceed the channel's
# transactionCapacity (both are 100 in this file).
b1001.sinks.k1.hdfs.batchSize = 100
# Number of threads each HDFS sink uses for writing files
b1001.sinks.k1.hdfs.threadsPoolSize = 10
# Close a file automatically when nothing has been written to it for this
# many seconds
b1001.sinks.k1.hdfs.idleTimeout = 60

# Memory channel: holds up to 10000 events, at most 100 per transaction.
b1001.channels.ch-1.type = memory
b1001.channels.ch-1.capacity = 10000
b1001.channels.ch-1.transactionCapacity = 100

2. 编写启动Flume http采集数据的脚本

#!/bin/sh
# filename: start-flume-http.sh
# desc: Start the Flume agent that collects article data; the agent name is b1001.
# date: 2022-02-25

# Set this to your Flume installation directory. No trailing slash, so the
# paths built from it below do not contain "//".
FLUME_HOME=/usr/local/flume

# All expansions are quoted so an installation path containing spaces works.
#   -c  Flume configuration directory
#   -f  agent configuration file
#   -n  agent name (must match the prefix used inside the configuration file)
# The -D options send logs to the console at INFO level and enable the HTTP
# monitoring endpoint on port 31002.
"${FLUME_HOME}/bin/flume-ng" agent \
  -c "${FLUME_HOME}/conf" \
  -f /opt/scripts/collect-app-http.conf \
  -n b1001 \
  -Dflume.root.logger=INFO,console \
  -Dflume.monitoring.type=http \
  -Dflume.monitoring.port=31002

3. 执行启动脚本start-flume-http.sh

[root@sam01 scripts]# sh start-flume-http.sh

4. 利用frp内网穿透将本地9666端口暴露给管理中心

[root@sam01 frp]# ./frpc http --sd sam02 -l 9666 -s frp.qfbigdata.com:7001 -u sam02

5. 进行配置：向管理中心注册数据采集地址

# Submit the collector's data URL, type and name as a multipart form POST
# to the metadata service's register endpoint.
curl --request POST \
  --form 'data_url=http://sam02.frp.qfbigdata.com:8002' \
  --form 'type=2' \
  --form 'name=sam02' \
  'http://metadata.frp.qfbigdata.com:8112/api/v1/meta/register'

6. 查看hdfs的UI界面

7. 下载数据文件

[root@sam01 ~]# hdfs dfs -get /sources/news-article/20220225/news-20220225_21.1645796829851.gz

8. 解压所下载的文件

[root@sam01 ~]# gzip -d news-20220225_21.1645796829851.gz

9. 查看解压的文件

[root@sam01 ~]# cat news-20220225_21.1645796829851

标签：Flume,hdfs,http,sinks,sources,采集,k1,b1001
来源： https://www.cnblogs.com/sam0/p/15938110.html