其他分享
首页 > 其他分享> > 数据分析 day02

数据分析 day02

作者:互联网

3.DataFrame基础操作巩固-股票分析

In [42]:

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import tushare as ts#财经数据接口包
import matplotlib.pyplot as plt

需求:股票分析

In [4]:

# Fetch the history for stock code '600519' from 2000-01-01 onward via tushare's get_k_data.
df = ts.get_k_data('600519',start='2000-01-01')
# Save to CSV. The row index is written as well, which is why the reloaded file
# shows an extra 'Unnamed: 0' column.
df.to_csv('./maotai.csv')

In [50]:

# Reload the saved CSV from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='./maotai.csv')
df.head(5)

Out[50]:

Unnamed: 0 date open close high low volume code
0 0 2001-08-27 5.392 5.554 5.902 5.132 406318.00 600519
1 1 2001-08-28 5.467 5.759 5.781 5.407 129647.79 600519
2 2 2001-08-29 5.777 5.684 5.781 5.640 53252.75 600519
3 3 2001-08-30 5.668 5.796 5.860 5.624 48013.06 600519
4 4 2001-08-31 5.804 5.782 5.877 5.749 23231.48 600519

In [51]:

# Remove, in place, the leftover index column that came back from the CSV.
df.drop(columns='Unnamed: 0', inplace=True)

In [7]:

# Preview the first three rows.
df.head(3)

Out[7]:

date open close high low volume code
0 2001-08-27 5.392 5.554 5.902 5.132 406318.00 600519
1 2001-08-28 5.467 5.759 5.781 5.407 129647.79 600519
2 2001-08-29 5.777 5.684 5.781 5.640 53252.75 600519

In [8]:

# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4386 entries, 0 to 4385
Data columns (total 7 columns):
date      4386 non-null object
open      4386 non-null float64
close     4386 non-null float64
high      4386 non-null float64
low       4386 non-null float64
volume    4386 non-null float64
code      4386 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 239.9+ KB

In [10]:

df.describe()  # summary statistics (aggregation) for the numeric columns

Out[10]:

open close high low volume code
count 4386.000000 4386.000000 4386.000000 4386.000000 4386.000000 4386.0
mean 198.346553 198.656232 201.125280 195.931863 27921.072905 600519.0
std 260.048946 260.403673 263.249482 257.143597 24503.505290 0.0
min 4.049000 4.045000 4.068000 4.012000 238.100000 600519.0
25% 27.526000 27.536750 27.820250 27.093000 11310.195000 600519.0
50% 113.967500 113.987500 115.515500 112.401000 23793.000000 600519.0
75% 194.410000 193.878000 197.158500 191.120750 37651.250000 600519.0
max 1231.000000 1233.750000 1241.610000 1228.060000 406318.000000 600519.0

In [52]:

# Parse the 'date' column into datetime values so it can serve as the row index.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [53]:

# Promote the 'date' column to the row index, modifying df in place.
df.set_index(keys='date', inplace=True)

In [54]:

# Preview the first rows; 'date' is now the row index.
df.head()

Out[54]:

open close high low volume code
date
2001-08-27 5.392 5.554 5.902 5.132 406318.00 600519
2001-08-28 5.467 5.759 5.781 5.407 129647.79 600519
2001-08-29 5.777 5.684 5.781 5.640 53252.75 600519
2001-08-30 5.668 5.796 5.860 5.624 48013.06 600519
2001-08-31 5.804 5.782 5.877 5.749 23231.48 600519

In [18]:

# Dates on which the close finished more than 3% above the same day's open.
intraday_gain = (df['close'] - df['open']) / df['open']
df.loc[intraday_gain > 0.03].index

In [21]:

# Dates on which the open was more than 2% below the previous row's close.
prev_close = df['close'].shift(1)
gap_ratio = (df['open'] - prev_close) / prev_close
df.loc[gap_ratio < -0.02].index

In [22]:

# Restrict the history to the rows dated 2010 through 2020 and preview them.
data = df.loc['2010':'2020']
data.head()

In [31]:

# Take the first row of every calendar month; buying 100 shares at each of
# those opening prices gives the total purchase cost.
data_monthly = data.resample('M').first()
monthly_open = data_monthly['open']
cost_money = 100 * monthly_open.sum()

In [32]:

# Take the last row of every calendar year, leaving out the final year in the data;
# selling 1200 shares at each of those opening prices gives the total proceeds.
yearly_last = data.resample('A').last()
data_yearly = yearly_last.iloc[:-1]
recv_money = 1200 * data_yearly['open'].sum()

In [33]:

# Value of the shares still held at the end of the window, priced at the last open.
# 100 shares are bought per calendar month and the yearly sale skips the final year
# in the data, so every month of that final year is still held (not always just one).
final_year = data.index[-1].year
months_held = data.index[data.index.year == final_year].month.nunique()
# .iloc[-1] is explicit positional access; a bare [-1] on a date-indexed Series
# relies on the deprecated integer-fallback lookup.
last_money = data['open'].iloc[-1] * 100 * months_held

In [34]:

# Net result: value of shares still held + sale proceeds - total purchase cost.
last_money+recv_money-cost_money

Out[34]:

567728.6999999997

需求:双均线策略制定

In [ ]:

In [64]:

# "ma" stands for moving average: 5-row and 30-row rolling means of the close.
closing = df['close']
ma5 = closing.rolling(window=5).mean()
ma30 = closing.rolling(window=30).mean()

In [65]:

# Attach both moving averages to the source frame as new columns, then display it.
df['ma5'], df['ma30'] = ma5, ma30
df

In [46]:

# Plot rows 50..99 of each moving average: short window in red, long window in blue.
plt.plot(ma5.iloc[50:100], color='red')
plt.plot(ma30.iloc[50:100], color='blue')

Out[46]:

[<matplotlib.lines.Line2D at 0x1b084f37550>]

img

In [ ]:

In [68]:

# Keep only the rows dated 2010 through 2020 (rebinding df), then display the result.
df = df.loc['2010':'2020']
df

In [74]:

# sr1: short average below the long one; sr2: short average at or above the long one.
sr1 = df['ma5'].lt(df['ma30'])
sr2 = df['ma5'].ge(df['ma30'])

In [77]:

# Death cross: ma5 is below ma30 on this row but was at/above it on the previous row.
death_mask = sr1 & sr2.shift(1)
df.loc[death_mask]  # rows where a death cross occurs
death_dates = df.loc[death_mask].index

In [79]:

# Golden cross: ma5 was below ma30 on the previous row and is at/above it on this row.
# Written positively (sr2 today AND sr1 yesterday) instead of negating a union:
# the negated form turned the missing value produced by shift() into a match, so the
# first row, which has no previous row, could be reported as a cross.
# fill_value=False keeps the shifted mask boolean and makes the first row never match.
golden_mask = sr2 & sr1.shift(1, fill_value=False)
df.loc[golden_mask]  # rows where a golden cross occurs
golden_dates = df.loc[golden_mask].index

In [80]:

# Display the golden-cross dates.
golden_dates

In [96]:

# Simulate buying on golden crosses and selling on death crosses, then print the profit.
first_money = 100000  # starting capital
money = first_money  # cash currently available
hold = 0  # number of shares currently held

s1 = Series(1, index=golden_dates)  # 1 marks a golden-cross date (buy)
s2 = Series(0, index=death_dates)  # 0 marks a death-cross date (sell)
# pd.concat replaces Series.append, which was removed in pandas 2.0.
s = pd.concat([s1, s2])  # every golden-cross and death-cross date
s = s.sort_index()  # chronological order

for date, signal in s.items():
    # The opening price is used as the trade price.
    price = df.loc[date, 'open']
    if signal == 1:  # golden cross: buy as many whole lots as the cash allows
        hand_cost = 100 * price  # cost of one lot (100 shares)
        hand_count = money // hand_cost  # number of whole lots affordable
        bought = hand_count * 100  # shares bought in this trade
        hold += bought  # add to, rather than overwrite, shares already held
        money -= bought * price
    else:  # death cross: sell everything held
        money += hold * price
        hold = 0

# If the last signal was a golden cross, the shares bought then are still held;
# value them at the last opening price so they count toward the total profit.
# .iloc[-1] is explicit positional access on the date-indexed column.
last_money = hold * df['open'].iloc[-1]
print(money + last_money - first_money)
1501254.9999999995
4.基于pandas的数据清洗

Python 3

Not Trusted

Run

处理丢失数据

In [1]:

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import tushare as ts#财经数据接口包
import matplotlib.pyplot as plt

In [2]:

# np.nan is a float value.
type(np.nan)

Out[2]:

float

In [5]:

# Arithmetic with NaN yields NaN.
np.nan + 3

Out[5]:

nan

In [3]:

# None has its own type, NoneType.
type(None)

Out[3]:

NoneType

In [10]:

# 8x5 frame of random integers drawn from [0, 100), displayed afterwards.
random_cells = np.random.randint(low=0, high=100, size=(8, 5))
df = DataFrame(data=random_cells)
df

Out[10]:

0 1 2 3 4
0 44 91 92 51 55
1 23 22 92 35 83
2 21 52 40 63 29
3 94 51 24 70 59
4 27 78 1 21 17
5 94 57 5 43 22
6 87 31 58 30 82
7 93 28 54 7 93

In [12]:

# Punch holes into the frame: three cells are set to None, one to np.nan.
for row, col in [(1, 2), (3, 4), (4, 1)]:
    df.iloc[row, col] = None
df.iloc[7, 4] = np.nan

In [13]:

# Display the frame; the cells assigned None/np.nan show up as NaN.
df

Out[13]:

0 1 2 3 4
0 44 91.0 92.0 51 55.0
1 23 22.0 NaN 35 83.0
2 21 52.0 40.0 63 29.0
3 94 51.0 24.0 70 NaN
4 27 NaN 1.0 21 17.0
5 94 57.0 5.0 43 22.0
6 87 31.0 58.0 30 82.0
7 93 28.0 54.0 7 NaN

pandas处理空值操作

In [16]:

# Boolean mask of the same shape: True where a value is missing.
df.isnull()

Out[16]:

0 1 2 3 4
0 False False False False False
1 False False True False False
2 False False False False False
3 False False False False True
4 False True False False False
5 False False False False False
6 False False False False False
7 False False False False True

In [20]:

# Boolean mask of the same shape: True where a value is missing.
df.isnull()

Out[20]:

0 1 2 3 4
0 False False False False False
1 False False True False False
2 False False False False False
3 False False False False True
4 False True False False False
5 False False False False False
6 False False False False False
7 False False False False True

In [24]:

# Per-row flag: True when the row contains no missing value; use it to keep those rows.
complete_rows = ~df.isnull().any(axis=1)
complete_rows
df.loc[complete_rows]

Out[24]:

0 1 2 3 4
0 44 91.0 92.0 51 55.0
2 21 52.0 40.0 63 29.0
5 94 57.0 5.0 43 22.0
6 87 31.0 58.0 30 82.0

In [28]:

# Same filter expressed positively: keep rows in which every value is present.
all_present = df.notnull().all(axis=1)
all_present
df.loc[all_present]

Out[28]:

0 1 2 3 4
0 44 91.0 92.0 51 55.0
2 21 52.0 40.0 63 29.0
5 94 57.0 5.0 43 22.0
6 87 31.0 58.0 30 82.0

In [29]:

df.dropna(axis=0)  # return a copy without the rows that contain missing values

Out[29]:

0 1 2 3 4
0 44 91.0 92.0 51 55.0
2 21 52.0 40.0 63 29.0
5 94 57.0 5.0 43 22.0
6 87 31.0 58.0 30 82.0

In [32]:

# Display the frame again; it still contains the missing values.
df

Out[32]:

0 1 2 3 4
0 44 91.0 92.0 51 55.0
1 23 22.0 NaN 35 83.0
2 21 52.0 40.0 63 29.0
3 94 51.0 24.0 70 NaN
4 27 NaN 1.0 21 17.0
5 94 57.0 5.0 43 22.0
6 87 31.0 58.0 30 82.0
7 93 28.0 54.0 7 NaN

In [33]:

# Forward-fill: replace each missing value with the nearest preceding value in the
# same column. DataFrame.ffill is used because fillna(method=...) is deprecated.
df.ffill(axis=0)

Out[33]:

0 1 2 3 4
0 44 91.0 92.0 51 55.0
1 23 22.0 92.0 35 83.0
2 21 52.0 40.0 63 29.0
3 94 51.0 24.0 70 29.0
4 27 51.0 1.0 21 17.0
5 94 57.0 5.0 43 22.0
6 87 31.0 58.0 30 82.0
7 93 28.0 54.0 7 82.0

面试题

In [ ]:

处理重复数据

In [ ]:

处理异常数据

5.DataFrame的级联and合并操作

级联操作

pandas使用pd.concat函数,与np.concatenate函数类似,只是多了一些参数:

objs
axis=0
keys
join='outer' / 'inner':表示的是级联的方式,outer会将所有的项进行级联(忽略匹配和不匹配),而inner只会将匹配的项级联到一起,不匹配的不级联
ignore_index=False

In [ ]:

合并操作

一对一合并

In [ ]:

# One-to-one merge demo, first table: each employee appears once with a group.
df1 = DataFrame(data=dict(
    employee=['Bob', 'Jake', 'Lisa'],
    group=['Accounting', 'Engineering', 'Engineering'],
))

In [ ]:

# One-to-one merge demo, second table: the same employees (different order) with hire years.
df2 = DataFrame(data=dict(
    employee=['Lisa', 'Bob', 'Jake'],
    hire_date=[2004, 2008, 2012],
))

一对多合并

In [ ]:

# One-to-many merge demo, first table: each group value appears once.
df3 = DataFrame(data=dict(
    employee=['Lisa', 'Jake'],
    group=['Accounting', 'Engineering'],
    hire_date=[2004, 2016],
))

In [ ]:

# One-to-many merge demo, second table: 'Engineering' is listed with two supervisors.
df4 = DataFrame(data=dict(
    group=['Accounting', 'Engineering', 'Engineering'],
    supervisor=['Carly', 'Guido', 'Steve'],
))

多对多合并

In [ ]:

# Many-to-many merge demo, first table: 'Engineering' occurs for two employees.
df1 = DataFrame(data=dict(
    employee=['Bob', 'Jake', 'Lisa'],
    group=['Accounting', 'Engineering', 'Engineering'],
))

In [ ]:

# Many-to-many merge demo, second table: 'Engineering' occurs twice here as well.
df5 = DataFrame(data=dict(
    group=['Engineering', 'Engineering', 'HR'],
    supervisor=['Carly', 'Guido', 'Steve'],
))

key的规范化

In [ ]:

# Key-normalisation demo, first table: shares both 'employee' and 'group' columns with its partner.
df1 = DataFrame(data=dict(
    employee=['Jack', 'Summer', 'Steve'],
    group=['Accounting', 'Finance', 'Marketing'],
))

In [ ]:

# Key-normalisation demo, second table: has 'employee' and 'group' plus a hire year.
df2 = DataFrame(data=dict(
    employee=['Jack', 'Bob', 'Jake'],
    hire_date=[2003, 2009, 2012],
    group=['Accounting', 'sell', 'ceo'],
))

In [ ]:

# Key-normalisation demo: this table names its people column 'employee'.
df1 = DataFrame(data=dict(
    employee=['Bobs', 'Linda', 'Bill'],
    group=['Accounting', 'Product', 'Marketing'],
    hire_date=[1998, 2017, 2018],
))

In [ ]:

# Key-normalisation demo: this table names its people column 'name' instead.
df5 = DataFrame(data=dict(
    name=['Lisa', 'Bobs', 'Bill'],
    hire_dates=[1998, 2016, 2007],
))

内合并与外合并:outer取并集 inner取交集

In [ ]:

# Inner/outer merge demo: the two tables have only the name 'Mary' in common.
df6 = DataFrame(data=dict(
    name=['Peter', 'Paul', 'Mary'],
    food=['fish', 'beans', 'bread'],
))
df7 = DataFrame(data=dict(
    name=['Mary', 'Joseph'],
    drink=['wine', 'beer'],
))

In [ ]:

# Same pair of tables again: foods per name and drinks per name, overlapping on 'Mary'.
foods_by_name = {
    'name': ['Peter', 'Paul', 'Mary'],
    'food': ['fish', 'beans', 'bread'],
}
drinks_by_name = {
    'name': ['Mary', 'Joseph'],
    'drink': ['wine', 'beer'],
}
df6 = DataFrame(foods_by_name)
df7 = DataFrame(drinks_by_name)

In [ ]:

# Exercise: merge df1 and df2.
# df1 holds name/age/classRoom columns; df2 holds random scores in [60, 100)
# and is indexed by the same three names.
dic1 = dict(
    name=['tom', 'jay', 'helly'],
    age=[11, 12, 33],
    classRoom=[1, 2, 3],
)
df1 = DataFrame(data=dic1)
scores = np.random.randint(60, 100, size=(3, 3))
df2 = DataFrame(
    data=scores,
    index=['jay', 'tom', 'helly'],
    columns=['java', 'python', 'c'],
)

标签:数据分析,False,df,day02,DataFrame,600519,money,Out
来源: https://www.cnblogs.com/bky20061005/p/12192443.html