用pyspark 自带的 jsonFile、load 命令导入JSON文件,若是文件中存在换行回车符,会出现“corrupt_record”错误。若是没有则可以正常导入。
解决办法:可以用python自带的json.loads命令导入为list后,再转为dataframe。
# Demo script: work around PySpark 1.x "corrupt_record" errors when a JSON
# file contains embedded newlines, by parsing with stdlib json.loads and
# converting the resulting list of dicts to a DataFrame.
#
# NOTE(review): `sqlContext` and `sc` are assumed to be provided by the
# PySpark shell environment (SQLContext / SparkContext) — this script is
# meant to be run inside pyspark, not standalone.

# Moved to the top: the original placed `import json` near the end, after
# json.loads had already been called — running top-to-bottom raised NameError.
import json

from pyspark.sql.types import *

JSON_PATH = 'C:\\Temp\\ColumnConfig_6009.json'

# Inspect the raw file with newlines flattened to spaces.
# `with` closes the handle; the original leaked this file object.
with open(JSON_PATH, 'r') as fp:
    flattened = fp.read().replace("\n", " ")
# flattened is e.g. '[ {"name":"Andy", "age":30}, {"name":"Justin", "age":19} ]'
# — a str, not a list. (The original called list(flattened) and discarded
# the result; that only splits the string into characters, so it is removed.)

############# Import & Convert JSON to dataframe
with open(JSON_PATH, 'r') as fp:
    data = json.loads(fp.read())        # stdlib parse -> list of dicts
data = sqlContext.createDataFrame(data)
data.registerTempTable("data")

sql1 = sqlContext.sql("select * from data ")
sql1.count()  # 2

# createDataFrame also accepts an inline list of dicts directly.
d = [{'name': 'Alice', 'age': 1}]
sqlContext.createDataFrame(d).collect()
d = [{"name": "Andy", "age": 30}, {"name": "Justin", "age": 19}]
sqlContext.createDataFrame(d).collect()

###################################################
# Native JSON loader — works only when the file has no embedded newlines
# inside records (otherwise each physical line must be a complete JSON doc).
dataframe = sqlContext.load(JSON_PATH, "json")
dataframe.printSchema()
dataframe.first()
dataframe.select("age").show(2)
dataframe.count()  # 2
dataframe.filter(dataframe.age < 20).show()
dataframe.filter(dataframe.name == "Andy").show()

###################################################
# Plain-text RDD example: pipe-delimited records.
lines = sc.textFile("C:\\Temp\\Trans_20150101.txt")
line1 = lines.map(lambda x: x.split("|"))
line1.take(5)

###################################################
## JSON Parsing with the standard library only.
with open(JSON_PATH, 'r') as fp:
    data = json.loads(fp.read())
print(data)  # list

with open(JSON_PATH) as json_file:
    raw_data = json_file.read()
data = json.loads(raw_data)
# Alternative that strips ALL whitespace first (kept from the original):
# data = json.loads("".join(raw_data.split()))