Reading HDFS Files with Python


###Method 1: read an HDFS file with the hdfs library
###Pass encoding='utf-8' when reading, otherwise each line comes back as bytes (b'xxx')
###Read the lines into a list, convert to a DataFrame, split the single column into fields, then cast the relevant columns to the right dtypes
from hdfs.client import Client
client = Client("http://hadoop-1-1:50070")  ## WebHDFS REST endpoint on the NameNode (default port 50070)

lines = []
with client.read("/user/spark/H2O/Wholesale_customers_data.csv", encoding='utf-8') as reader:
    for line in reader:  
        lines.append(line.strip())

column_str = lines[0]
column_list = column_str.split(',')

data = {"item_list":lines[1:]}

import pandas as pd
df = pd.DataFrame(data=data)
df[column_list] = df["item_list"].apply(lambda x: pd.Series(x.split(",")))  ## split each raw string into named columns
df.drop("item_list", axis=1, inplace=True)  ## drop the raw column

df.dtypes
"""
Region              object
Fresh               object
Milk                object
Grocery             object
Frozen              object
Detergents_Paper    object
Delicassen          object
target              object
dtype: object

"""


df = df.astype('int')  ## cast every column from object to int64
df.dtypes
"""
Region              int64
Fresh               int64
Milk                int64
Grocery             int64
Frozen              int64
Detergents_Paper    int64
Delicassen          int64
target              int64
dtype: object
"""
###Method 2: read an HDFS file with the pydoop library
import pydoop.hdfs as hdfs

lines = []
with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    for line in f:
        ##print(line)
        lines.append(line.strip())


column_list = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']

data = {"item_list":lines[0:]}

import pandas as pd
df = pd.DataFrame(data=data)
df[column_list] = df["item_list"].apply(lambda x: pd.Series(x.split(",")))  ## split each raw string into named columns
df.drop("item_list", axis=1, inplace=True)  ## drop the raw column

## cast the four measurement columns to float
df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']] = df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']].astype('float64')

df.dtypes
"""
Sepal_Length    float64
Sepal_Width     float64
Petal_Length    float64
Petal_Width     float64
Species          object
dtype: object
"""
###Read the data directly with pd.read_table
import pydoop.hdfs as hdfs
import pandas as pd

###This file includes a header row
with hdfs.open('/user/spark/security/iris.csv', 'rt') as f:
    df = pd.read_table(f)


## pd.read_table splits on tabs by default, so each comma-separated record lands in a single
## column whose name is the original header line; recover the real columns by splitting it
column_list = df.columns[0].split(",")
df[column_list] = df.iloc[:,0].apply(lambda x: pd.Series(x.split(",")))  ## note: use df.iloc[:,0], not the long combined column name

df.head()
"""
Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species	Sepal_Length	Sepal_Width	Petal_Length	Petal_Width	Species
0	5.1,3.5,1.4,0.2,setosa	5.1	3.5	1.4	0.2	setosa
1	4.9,3,1.4,0.2,setosa	4.9	3	1.4	0.2	setosa
2	4.7,3.2,1.3,0.2,setosa	4.7	3.2	1.3	0.2	setosa
3	4.6,3.1,1.5,0.2,setosa	4.6	3.1	1.5	0.2	setosa
4	5,3.6,1.4,0.2,setosa	5	3.6	1.4	0.2	setosa
"""


df.drop(df.columns[0], axis=1, inplace=True)  ## drop the combined raw column
df.dtypes
"""
Sepal_Length    object
Sepal_Width     object
Petal_Length    object
Petal_Width     object
Species         object
dtype: object
"""


#####Cast the 'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width' fields to float
df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']] = df[['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']].astype('float')

df.dtypes
"""
Sepal_Length    float64
Sepal_Width     float64
Petal_Length    float64
Petal_Width     float64
Species          object
dtype: object
"""

Reprinted from: https://my.oschina.net/kyo4321/blog/3016864

Copyright notice: this article comes from CSDN and is licensed under CC 4.0 BY-SA; please include the original source link and this notice when reprinting.
Original link: https://blog.csdn.net/weixin_34185364/article/details/91902549