编程语言
首页 > 编程语言 > 【Python】本专栏要点复习

【Python】本专栏要点复习

作者:互联网

本文是对本专栏的简单复习。
至此,本专栏已经完结。后续最多只是修补一些内容上的bug了。
写这个专栏的初衷其实是为了自己的复习,如果这些内容对你的学习能起到帮助,那便是我的荣幸。

最后的感悟大概就是:
依然要勤学苦练,最终与实践结合。我们学习代码这个工具就是为了实战使用,而不只是学习函数。
一定要持之以恒地学习,并与实践融合。

本文于2021/12/22首发于CSDN,有不足请指出。

'''1.爬虫'''
import chardet
import requests

# Download a page and decode it using the encoding sniffed from its raw bytes.
url = ''  # placeholder: set the target URL before running
ua = {"User-Agent": " "}  # placeholder request headers
rqg = requests.get(url, headers=ua)
# Detect the encoding from the body bytes and store it on the response.
rqg.encoding = chardet.detect(rqg.content)['encoding']

# Read the body through .text so the encoding set above is the one applied;
# a hard-coded 'utf-8' decode ignores the detection and fails on other charsets.
html = rqg.text

from bs4 import BeautifulSoup
from lxml import etree
# Parse the downloaded HTML and read data out of the first <ul> element.
soup = BeautifulSoup(html, 'lxml')
tag = soup.ul  # first <ul> in the document
tag.attrs  # the tag's attributes
tag.li.get_text()  # text of the first <li> under that <ul>
tag.get('href')  # attribute lookup by name ('href', not 'herf')

soup.find_all('ul')  # every <ul> in the document

# Collect the text and the link target of every <a> under the first <ul>.
# The list names are kept as in the original script: `urls` receives the
# link texts and `herfs` receives the href values.
urls = []
herfs = []
for i in tag.find_all('a'):
    urls.append(i.get_text())
    herfs.append(i.get('href'))
for i in tag.find_all('a'):
    print(i.get('href'), i.get_text(), end='\n')

# Query the same HTML with lxml XPath expressions.
xp = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
xp.xpath('//a')  # every <a> element
xp.xpath("body/div/a[starts-with(@id,'co')]")  # <a> under body/div whose id starts with 'co'
xp.xpath("//a/text()")  # text nodes of every <a>
# starts-with() takes two arguments (the string and the prefix); the
# one-argument form is rejected by the XPath engine.
# NOTE(review): the prefix 'co' mirrors the query above - adjust as needed.
j = xp.xpath("//p[starts-with(@id,'co')]")
for i in j:
    # string(.) concatenates all text inside the element, nested tags included.
    t = i.xpath('string(.)')
    print(t)

import re
# Extract <title> elements from the response text with a regular expression.
# The attribute part is optional, so a plain "<title>" matches as well as
# "<title attr=...>"; the original pattern demanded a space after "<title".
# Group 1 is the attribute text ('' when there is none), group 2 the content.
title_pattern = r'<title(?:\s+(.*?))?>(.*?)</title>'
title_com = re.compile(title_pattern, re.M | re.S)  # re.S lets '.' span newlines
title_find = title_com.findall(rqg.text)  # list of (attributes, content) tuples

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# Drive a real browser so that JavaScript-rendered content ends up in the HTML.
# NOTE(review): recent Selenium 4 releases no longer take the driver path as a
# positional argument - confirm against the installed version.
div = webdriver.Chrome('./chromedriver')
div.get(url)
time.sleep(5)  # crude wait for the page to finish rendering
html = div.page_source

# find_element(By.<strategy>, value) is the locator API that works on both
# Selenium 3 and 4; the find_element_by_* helpers were removed in Selenium 4.
element = div.find_element(By.ID, 'pass')  # returns the first match only
elements = div.find_elements(By.NAME, 'a')  # returns a list of all matches
element2 = div.find_element(By.XPATH, "//p[@id='pass']")
element3 = div.find_element(By.TAG_NAME, 'div')

import time
import requests
import json
# Request a JSON endpoint and print two fields of every record under "data".
url = ''
ua = {"User-Agent": ' '}
# The body could also be obtained with .content.decode('utf-8') instead of .text.
html = requests.get(url, headers=ua).text
data = json.loads(html)
dic = data['data']
for i in dic:
    cover, title = i['picPath'], i['bookName']
    print(cover, title)

import requests
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# Open a second browser session for the current url and print every book card.
div1 = webdriver.Chrome('./chromedriver')
try:
    div1.get(url)
    time.sleep(5)  # crude wait for the page to finish rendering
    # div1.page_source would give the rendered HTML if it were needed here.
    # Query the session that was just opened (div1), not the earlier one.
    e1 = div1.find_elements(By.XPATH, "//div[@class='book']")
    for book in e1:
        print(book.text)
finally:
    # Always close the browser started above, even if scraping fails.
    div1.quit()
div.quit()  # also close the first browser session opened earlier in the script

'''2.写入文件'''
# Append the downloaded page text to a file. An explicit encoding is given so
# that non-ASCII text does not depend on the platform's default code page.
with open(r'c:\file.txt', 'a+', encoding='utf-8') as f:
    f.write(rqg.text)

import xlwings as xw
# Open the workbook, pick a sheet and write a list of values at cell A2.
wb = xw.Book(r"c:\excel.xlsx")
sht = wb.sheets['Sheet1']
target = sht.range('A2')
target.value = ['aaa', 'nnn']  # cell A2; the value may also be a DataFrame

'''3.pandas'''
import numpy as np
import pandas as pd
# Random-number helpers; each call returns a new array (discarded here).
np.random.random((4, 5))  # uniform floats in [0, 1); shape passed as a tuple
np.random.rand(4, 5)  # uniform floats in [0, 1); shape passed as separate args
np.random.randn(4, 5)  # standard normal samples; randn(10) gives a 1-D array of ten
np.random.randint(5, 10, size=[2, 5])  # integers in [5, 10): upper bound excluded

arr1 = np.arange(4)
arr1.ravel()  # flatten to 1-D
arr1.flatten()  # flatten to 1-D as well; the call parentheses are required

arr2 = np.arange(1)
arr_st = np.concatenate((arr1, arr2), axis=0)  # join the arrays along axis 0
# arr1 is one-dimensional, so axis 0 is the only axis it can be split along;
# axis=1 raises AxisError here (it is only valid for arrays with >= 2 dims).
arr_sp = np.split(arr1, 2, axis=0)

# header=0 is the default when `names` is not passed: the first row of the
# file supplies the column names.
# Passing header=0 together with `names` replaces the file's own header row
# with the given names.
# header=None is the default when `names` is passed: no row is used as header.
#
# The file is read with its header row and 'city' is kept as an ordinary
# column, because the statements that follow select df1['city'] by name.
# (header=None combined with index_col='city' cannot work: without a header
# row there is no column called 'city' to use as the index.)
df1 = pd.read_csv(r"c:/df.csv", header=0)
df1.values  # underlying data as a NumPy array
df1.index  # row labels
df1.columns  # column labels
df1.dtypes  # dtype of each column
df1.size  # total number of cells
df1.ndim  # number of dimensions
df1.shape  # (rows, columns)

df1.describe()  # summary statistics
df1.info()  # prints column dtypes and non-null counts
# NOTE(review): mean/var/std need a numeric column - confirm that 'city' is
# the intended column here.
df1['city'].mean()  # var, std

# df2 = df1.set_index('city')

# Column and row selection.
df1.loc[:, ['city', 'sex']]  # by label: all rows, the two named columns
df1[['city', 'sex']]  # the same columns via a list of names
df1.iloc[:, :2]  # by position: all rows, first two columns
df1[:2]  # a bare slice selects rows: the first two
# df1[:, :2] is invalid
# df1[0] does not select a row (it looks up a column labelled 0)
# .loc takes labels or a boolean mask, not the DataFrame itself; filter rows
# with a boolean condition instead.
df1.loc[df1["city"] == '北京']
df1[(df1["city"] == '北京') & (df1["sex"] == 'female')]  # combine masks with &

# Adding and removing rows/columns, then saving.
data = {"city": 'lanzhou', "sex": "female"}
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True renumbers the rows to avoid index clashes. The result is
# a new frame; df1 itself is left unchanged, as in the original statement.
pd.concat([df1, pd.DataFrame([data])], ignore_index=True)
df1['age'] = [20, 19, 21]  # the list length must equal the number of rows
# NOTE(review): drop() raises KeyError for labels missing from the index -
# confirm rows 1 and 3 exist.
df1.drop([1, 3])  # drop the rows labelled 1 and 3
df1.drop(columns=["age", "city"])  # index= drops rows; axis= also works

# to_csv is a DataFrame method; there is no module-level pd.to_csv.
df1.to_csv(r"C:\i.csv", sep=',')

# Grouping: agg is a method and must be called with the list of functions
# (subscripting it with [...] raises TypeError).
groupby = df1.groupby('分公司')[['薪水', '小时报酬']].agg(['min', 'max'])

# Pivot table of the two value columns, indexed by two key columns.
pivot_table = df1.pivot_table(values=['小时报酬', '薪水'], index=['分公司', '部门'])

concat_join = pd.concat([df1[:2], df1[2:]], axis=1, join='inner')
# join='inner' keeps only labels present in every frame (dangling rows are
# dropped); join='outer' keeps them.
merge = pd.merge(df1[:2], df1[2:], left_on='学号', right_on="学号")
# NOTE(review): df2 is not defined in the visible code (it only appears in a
# commented-out line) - it must exist before this statement can run.
combine = df1.combine_first(df2)  # fill gaps in df1 with values from df2

df1["姓名"].drop_duplicates()  # remove duplicate values

df1.isnull()  # notnull is the inverse
df1.dropna(axis=0, how='all')  # 'all': drop rows that are entirely NaN; 'any': drop rows with any NaN
df1['小时报酬'].fillna(df1['小时报酬'].mean())  # fill missing values with the column mean

# Other interpolators in the same module: interp1d, make_interp_spline
from scipy.interpolate import lagrange  # the module is "interpolate"
# NOTE(review): x and y1 are not defined in the visible code - they must be
# equal-length sequences of sample points and values.
l1 = lagrange(x, y1)
l1([6, 7])  # value of the interpolating polynomial at x = 6 and x = 7

def outRange(ser, n_sigma=3):
    """Return the elements of *ser* lying outside mean +/- n_sigma * std.

    Args:
        ser: a pandas Series of numeric values.
        n_sigma: half-width of the accepted band in standard deviations;
            the default of 3 reproduces the classic three-sigma rule.

    Returns:
        A Series holding only the out-of-range elements, with their
        original index labels (empty when nothing is out of range).
    """
    center = ser.mean()
    spread = n_sigma * ser.std()
    # Named `mask` rather than `bool` so the builtin is not shadowed.
    mask = (ser < center - spread) | (ser > center + spread)
    # Boolean indexing selects the flagged rows directly; no detour through
    # integer positions is needed.
    return ser[mask]
# Collect the values of the "age" column that outRange flags.
age_column = df1["age"]
outlier = outRange(age_column)

# Expand the "name" column into dummy (indicator) variables.
pd.get_dummies(df1["name"])

import matplotlib.pyplot as plt
# Other plot types: barh, plot, boxplot, stackplot, hist, pie, scatter, polar, errorbar
# NOTE(review): bar() needs at least x positions and heights - the empty call
# is only a reminder of the function name.
plt.bar()

plt.figure()
plt.xlabel('x轴标签')
# xticks takes the tick positions first and the labels second; passing only
# the labels would use them as positions.
plt.xticks([0, 1, 2], ['a', 'b', 'c'])  # set the tick labels
plt.title('设置标题')
plt.legend()  # builds the legend from labelled artists automatically
# NOTE(review): `lines` is not defined in the visible code - it should be the
# list of plotted line handles.
plt.legend(lines, ['线条1', "线条2"], loc='best')
plt.grid(visible=True)  # show the grid

# plt.subplots returns (figure, axes) in that order; with a 2x2 grid the axes
# come back as a 2-D array that is indexed [row, column].
fig, ax = plt.subplots(2, 2, figsize=(10, 5))
ax1 = ax[1, 0]  # second row, first column
ax1.plot()  # draw on that subplot

标签:plt,复习,get,Python,df1,专栏,np,import,div
来源: https://blog.csdn.net/amakusa_/article/details/122096306