数据载入、存储及文件格式 ¶

文本格式数据的读写 ¶

import numpy as np
import pandas as pd
import sys
import csv
import json

将表格型数据读取为DataFrame对象是pandas的重要特性。下面是部分实现文件读取功能的函数，read_csv和read_table可能是后期我们使用最多的函数。

这些函数的可选参数主要有以下几种类型：

索引：可以将一列或多个列作为返回的DataFrame，从文件或用户处获得列名，或者没有列名。
类型推断和数据转换：包括用户自定义的值转换和自定义的缺失值符号列表。
日期时间解析：包括组合功能，也包括将分散在多个列上的日期和时间信息组合成结果中的单个列。
迭代：支持对大型文件的分块迭代。
未清洗数据问题：跳过行、页脚、注释以及其他次要数据，比如使用逗号分隔千位的数字。

file01 = '../examples/ex1.csv'
# 使用read_csv将文件读入一个DataFrame
df = pd.read_csv(file01)
print(df)
#    1   2   3   4  hello
# 0  5   6   7   8  world
# 1  9  10  11  12    foo

df = pd.read_csv(file01, header=None)  # 使用pandas自动分配默认列名
print(df)
#    0   1   2   3      4
# 0  1   2   3   4  hello
# 1  5   6   7   8  world
# 2  9  10  11  12    foo

df = pd.read_csv(file01, names=['aa', 'bb', 'cc', 'dd', 'message'])  # 自己指定列名
print(df)
#    aa  bb  cc  dd     ee
# 0   1   2   3   4  hello
# 1   5   6   7   8  world
# 2   9  10  11  12    foo

# 使用read_table，并指定分隔符，将文件读入一个DataFrame
df = pd.read_table(file01, sep=',')
print(df)
#    a   b   c   d message
# 0  1   2   3   4   hello
# 1  5   6   7   8   world
# 2  9  10  11  12     foo

从多个列中形成一个分层索引

parased = pd.read_csv('../examples/csv_mindex.csv', index_col=['key1', 'key2'])
print(parased)
#            value1  value2
# key1 key2
# one  a          1       2
#      b          3       4
#      c          5       6
#      d          7       8
# two  a          9      10
#      b         11      12
#      c         13      14
#      d         15      16

下例中，由于列名的数量比数据的列数少一个，因此read_table推断第一列应当作为DataFrame的索引。

ex3.txt原始文件内容

A         B         C
aaa   -0.264438         -1.026059   -0.619500
bbb  0.927272    0.302904 -0.032399
ccc      -0.264273      -0.386314        -0.217601
ddd -0.871858 -0.348382          1.100491

result = pd.read_table('../examples/ex3.txt')  # 直接读取
print(result)
#                                                 A         B         C
# aaa   -0.264438                                 -1.026059   -0.619500
# bbb  0.927272    0.302904 -0.032399                               NaN
# ccc      -0.264273      -0.386314                           -0.217601
# ddd -0.871858 -0.348382                                      1.100491
result = pd.read_table('../examples/ex3.txt', sep='\s+')  # 向read_table正则表达式为\s+来格式化文件
print(result)
#             A         B         C
# aaa -0.264438 -1.026059 -0.619500
# bbb  0.927272  0.302904 -0.032399
# ccc -0.264273 -0.386314 -0.217601
# ddd -0.871858 -0.348382  1.100491

下例中ex4.csv原始文件内容

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

result = pd.read_csv('../examples/ex4.csv', skiprows=[0, 2, 3])  # 使用skiprows来跳过第一行、第三行和第四行
print(result)
#    a   b   c   d message
# 0  1   2   3   4   hello
# 1  5   6   7   8   world
# 2  9  10  11  12     foo

缺失值处理

默认情况下，pandas使用一些常见的标识，例如NA和NULL

下例中ex5.csv原始文件内容

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

result = pd.read_csv('../examples/ex5.csv')
print(result)
#   something  a   b     c   d message
# 0       one  1   2   3.0   4     NaN
# 1       two  5   6   NaN   8   world
# 2     three  9  10  11.0  12     foo
print(pd.isnull(result))
#    something      a      b      c      d  message
# 0      False  False  False  False  False     True
# 1      False  False  False   True  False    False
# 2      False  False  False  False  False    False
result = pd.read_csv('../examples/ex5.csv', na_values=['NULL'])
print(result)
#   something  a   b     c   d message
# 0       one  1   2   3.0   4     NaN
# 1       two  5   6   NaN   8   world
# 2     three  9  10  11.0  12     foo

定义替换规则

sentinels = {
    'message': ['foo', 'NA'],
    'something': ['two']
}
result = pd.read_csv('../examples/ex5.csv', na_values=sentinels)

把message列所有值为foo或NA的替换为Null

把something列所有值为two的替换为Null

print(result)
#   something  a   b     c   d message
# 0       one  1   2   3.0   4     NaN
# 1       NaN  5   6   NaN   8   world
# 2     three  9  10  11.0  12     NaN

分块读入文本文件 ¶

pd.options.display.max_rows = 10
result = pd.read_csv('../examples/ex6.csv')  # 读取全部记录
print(result)
result = pd.read_csv('../examples/ex6.csv', nrows=5)  # 读取前5行记录
print(result)
# [10000 rows x 5 columns]
#         one       two     three      four key
# 0  0.467976 -0.038649 -0.295344 -1.824726   L
# 1 -0.358893  1.404453  0.704965 -0.200638   B
# 2 -0.501840  0.659254 -0.421691 -0.057688   G
# 3  0.204886  1.074134  1.388361 -0.982404   R
# 4  0.354628 -0.133116  0.283763 -0.837063   Q
result = pd.read_csv('../examples/ex6.csv', chunksize=1000)  # 分块读入文件，每块1000行
print(result)  # 返回的是一个TextParser对象, 允许你根据chunksize遍历文件。
# <pandas.io.parsers.readers.TextFileReader object at 0x7f2b3cd01730>

可以遍历ex6.csv，并对key列聚合获得计数值

tot = pd.Series([], dtype=float)  # 这里需要显式指定dtype，后续Python会将默认值从float64变成object，目前默认是float64
for piece in result:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
tot = tot.sort_values(ascending=False)
print(tot[:10])
# E    368.0
# X    364.0
# L    346.0
# O    343.0
# Q    340.0
# M    338.0
# J    337.0
# F    335.0
# K    334.0
# H    330.0
# dtype: float64

将数据写入文本格式 ¶

data = pd.read_csv('../examples/ex5.csv')
print(data)
#   something  a   b     c   d message
# 0       one  1   2   3.0   4     NaN
# 1       two  5   6   NaN   8   world
# 2     three  9  10  11.0  12     foo

使用DataFrame的to_csv方法，将数据导出为逗号分隔的文件

data.to_csv('../examples/out.csv')
# 输出out.csv的内容
# ,something,a,b,c,d,message
# 0,one,1,2,3.0,4,
# 1,two,5,6,,8,world
# 2,three,9,10,11.0,12,foo

使用DataFrame的to_csv方法，将数据导出为其他的分隔符的文件

data.to_csv(sys.stdout, sep='|')
# |something|a|b|c|d|message
# 0|one|1|2|3.0|4|
# 1|two|5|6||8|world
# 2|three|9|10|11.0|12|foo
data.to_csv(sys.stdout, sep=',')
# ,something,a,b,c,d,message
# 0,one,1,2,3.0,4,
# 1,two,5,6,,8,world
# 2,three,9,10,11.0,12,foo
data.to_csv(sys.stdout, sep=',', na_rep='NULL')  # 设定缺失值在输出时以空字符串出现
# ,something,a,b,c,d,message
# 0,one,1,2,3.0,4,NULL
# 1,two,5,6,NULL,8,world
# 2,three,9,10,11.0,12,foo
data.to_csv(sys.stdout, sep=',', na_rep='NULL', index=False, header=False)  # 不输出行和列的标签（index，header）
# one,1,2,3.0,4,NULL
# two,5,6,NULL,8,world
# three,9,10,11.0,12,foo
data.to_csv(sys.stdout, sep=',', na_rep='NULL', index=False, header=False, columns=['a', 'b', 'c'])  # 按照自定的顺序输出子集
# 1,2,3.0
# 5,6,NULL
# 9,10,11.0

Series也有to_csv方法

dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv('../examples/tseries.csv', header=False)
# 输出tseries.csv文件内容
# 2000-01-01,0
# 2000-01-02,1
# 2000-01-03,2
# 2000-01-04,3
# 2000-01-05,4
# 2000-01-06,5
# 2000-01-07,6

使用分隔格式 ¶

绝大多数的表型数据都可以使用函数pandas.read_table从硬盘中读取。

然而，在某些情况下，接收一个带有一行或多行错误的文件并不少见，read_table也无法解决这种情况。

ex7.csv 文件内容

"a","b","c"
"1","2","3"
"1","2","3"

f = open('../examples/ex7.csv')  # 使用Python的内建csv模块
reader = csv.reader(f)  # 将任一打开的文件或文件型对象传给csv.reader
for line in reader:  # # 遍历reader，产生元组，元组的值为删除了引号的字符
    print(line)
f.close()
# ['a', 'b', 'c']
# ['1', '2', '3']
# ['1', '2', '3']

with open('../examples/ex7.csv') as f:
    lines = list(csv.reader(f))  # 首先，将文件读取为行的列表
    header, values = lines[0], lines[1:]  # 其次，将数据拆分为列名行和数据行
    data_dict = {
        h: v for h, v in zip(header, zip(*values))  # 再然后，使用字典推导式和表达式zip(*values)生成一个包含数据列的字典，字典中行转置成列
    }
print(data_dict)  # 输出结果
# {'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

如果需根据不同的分隔符、字符串引用约定或行终止符定义一种新的格式时，可以:

方法1：使用csv.Dialect定义一个简单的子类

class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'  # 这里只能是一个字符
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL


f = open('../examples/ex7.csv')
reader = csv.reader(f, dialect=my_dialect)
for line in reader:  # 遍历reader，产生元组，元组的值为删除了引号的字符
    print(line)
f.close()
# ['a,"b","c"']
# ['1,"2","3"']
# ['1,"2","3"']

方法2：直接将CSV方言参数(dialect)传入csv.reader的关键字参数。比较详细的介绍方言和分隔符

f = open('../examples/ex7.csv')
reader = csv.reader(f, delimiter='|')
for line in reader:  # 遍历reader，产生元组，元组的值为删除了引号的字符
    print(line)
f.close()
# ['a,"b","c"']
# ['1,"2","3"']
# ['1,"2","3"']

对于具有更复杂或固定的多字符分隔符的文件，将无法使用csv模块。在此类情况下，将使用字符串的split方法或正则表达式方法re.split进行行拆分和其他清理工作。需要手动写入被分隔的文件时，你可以使用csv.writer。这个函数接收一个已经打开的可写入文件对象以及和csv.reader相同的CSV方言、格式选项.

with open('../examples/mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))
    writer.writerow(('10', '11', '12'))

# mydata.csv 文件内容
# 1;2;3
# 4;5;6
# 7;8;9
# 10;11;12

JSON数据 ¶

obj = """
{
    "name": "Wes",
    "places_lived": ["United States", "Spain", "Germany"],
    "pet": null,
    "siblings": [
        {
            "name": "Scott",
            "age": 30,
            "pets": ["Zeus", "Zuko"]
        },
        {
            "name": "Katie",
            "age": 38,
            "pets": ["Sixes", "Stache", "Cisco"]
        }
    ]
}
"""

将JSON字符串转换为Python形式时，使用json.loads方法

result = json.loads(obj)
print(result)
# {'name': 'Wes', 'places_lived': ['United States', 'Spain', 'Germany'], 'pet': None, 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']}, {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}

另一方面，json.dumps可以将Python对象转换回JSON

asjson = json.dumps(result)
print(asjson)
# {"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]}

将JSON对象或对象列表转换为DataFrame或其他数据结构。

比较方便的方式是将字典构成的列表（之前是JSON对象）传入DataFrame构造函数，并选出数据字段的子集。

siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
print(siblings)
#     name  age
# 0  Scott   30
# 1  Katie   38

pandas.read_json可以自动将JSON数据集按照指定次序转换为Series或DataFrame。 pandas.read_json的默认选项是假设JSON数组中的每个对象是表里的一行。

例如读取 data = pd.read_json('../examples/example_new.json')

data = pd.read_json('../examples/example.json')
print(data)
#    a  b  c
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9
print(data.to_json())
# {"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
print(data.to_json(orient='records'))
# [{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]

XML和HTML：网络抓取 ¶

pandas的内建函数read_html可以使用lxml和Beautiful Soup等库将HTML中的表自动解析为DataFrame对象。

tables = pd.read_html('../examples/fdic_failed_bank_list.html')
print(len(tables))
# 1

failures = tables[0]  # //*[@id="table"]
print(failures.head())  # 读取前5行记录
#                       Bank Name  ...       Updated Date
# 0                   Allied Bank  ...  November 17, 2016
# 1  The Woodbury Banking Company  ...  November 17, 2016
# 2        First CornerStone Bank  ...  September 6, 2016
# 3            Trust Company Bank  ...  September 6, 2016
# 4    North Milwaukee State Bank  ...      June 16, 2016
#
# [5 rows x 7 columns]

close_timestamps = pd.to_datetime(failures['Closing Date'])  # 计算每年银行倒闭的数量
print(close_timestamps.dt.year.value_counts())
# 2010    157
# 2009    140
# 2011     92
# 2012     51
# 2008     25
#        ...
# 2004      4
# 2001      4
# 2007      3
# 2003      3
# 2000      2
# Name: Closing Date, Length: 15, dtype: int64