转载自:https://www.jianshu.com/p/e4c90dc08935
通过Spark将关系型数据库(以Oracle为例)的表同步到Hive表,要求用Spark建表,有字段注释的也要加上注释。Spark建表,有两种方法:
前面讲到DataFrame里没有Oracle的注释信息,但是如果数据源为Hive的话,是可以将注释获取到的。
-- Hive table whose columns and the table itself all carry comments.
CREATE TABLE `test` (
  `id`   STRING COMMENT 'ID',
  `Name` STRING COMMENT '名字'
)
COMMENT '测试';
首先看一下,df.printSchema的输出里并没有注释信息:
// Switch to the Hive database that contains the `test` table.
spark.sql("use test")

// Load the Hive table as a DataFrame and print its schema tree.
val df = spark.table("test")
df.printSchema()
root |-- id: string (nullable = true) |-- name: string (nullable = true)
用下面这行代码就可以打印注释信息:
// Print (columnName, metadata) for every column; the metadata is where the column comment shows up.
df.schema.fields.foreach { field =>
  println((field.name, field.metadata))
}
(id,{"comment":"ID","HIVE_TYPE_STRING":"string"}) (name,{"comment":"名字","HIVE_TYPE_STRING":"string"})
-- Source table in Oracle, followed by its column-level and table-level comments.
CREATE TABLE ORA_TEST (
  ID   VARCHAR2(100),
  NAME VARCHAR2(100)
);

COMMENT ON COLUMN ORA_TEST.ID IS 'ID';
COMMENT ON COLUMN ORA_TEST.NAME IS '名字';
COMMENT ON TABLE ORA_TEST IS '测试';
代码:
package com.dkl.leanring.spark.sql.Oracle

import org.apache.spark.sql.SparkSession

/**
 * Reads the Oracle table ORA_TEST over JDBC and prints every column's name
 * together with the metadata Spark attached to it.
 */
object OracleSchemaDemo {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("OracleSchemaDemo")
      .master("local")
      .getOrCreate()

    // JDBC settings for the source Oracle table.
    val jdbcOptions = Map(
      "url" -> "jdbc:oracle:thin:@192.168.44.128:1521:orcl",
      "dbtable" -> "ORA_TEST",
      "user" -> "bigdata",
      "password" -> "bigdata",
      "driver" -> "oracle.jdbc.driver.OracleDriver"
    )

    val df = spark.read.format("jdbc").options(jdbcOptions).load()

    // Print (columnName, metadata) for each column of the loaded table.
    for (field <- df.schema) {
      println((field.name, field.metadata))
    }

    spark.stop()
  }
}
(ID,{"name":"ID","scale":0}) (NAME,{"name":"NAME","scale":0})
注:Spark 2.3.0和Spark 2.2.1的元数据不太一样,上面的结果是Spark 2.2.1的(也是我写博客测试用的),项目中用的是Spark 2.3.0,2.3.0的元数据是空的,如下:
(ID,{})
(NAME,{})
可见并没有注释信息。
import org.apache.spark.sql.types._

// Column name -> comment text. Columns that are absent from this map get no comment.
val commentMap = Map("ID" -> "ID", "NAME" -> "名字")

// Attach a comment only where one is defined. Looking the name up with commentMap(s.name)
// would throw NoSuchElementException for any column that has no entry in the map.
val schema = df.schema.map { s =>
  commentMap.get(s.name).fold(s)(s.withComment)
}

// Build a new DataFrame from the comment-annotated schema.
val new_df = spark.createDataFrame(df.rdd, StructType(schema)).repartition(160)
new_df.schema.foreach(s => println(s.name, s.metadata))
(ID,{"comment":"ID","name":"ID","scale":0}) (NAME,{"comment":"名字","name":"NAME","scale":0})
需将前面代码中的spark改成支持Hive,即在创建SparkSession时加上enableHiveSupport():
// Select the target Hive database.
spark.sql("use test")

// Write the DataFrame as table ORA_TEST using the "overwrite" save mode.
val hiveWriter = new_df.write.mode("overwrite")
hiveWriter.saveAsTable("ORA_TEST")
然后在Hive里看一下是否有注释:
可以看到,成功地把注释也保存到了Hive里。
附上在Eclipse运行的完整代码
package com.dkl.leanring.spark.sql.Oracle

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

/**
 * Copies the Oracle table ORA_TEST into the Hive database `test`, adding column
 * comments to the schema before the table is saved.
 */
object OracleSchemaDemo {

  def main(args: Array[String]): Unit = {
    // Hive support is enabled so that saveAsTable below writes a Hive table.
    val spark = SparkSession
      .builder()
      .appName("OracleSchemaDemo")
      .master("local")
      .enableHiveSupport()
      .getOrCreate()

    // Stop the session even when the JDBC read or the Hive write fails.
    try {
      val df = spark.read
        .format("jdbc")
        .option("url", "jdbc:oracle:thin:@192.168.44.128:1521:orcl")
        .option("dbtable", "ORA_TEST")
        .option("user", "bigdata")
        .option("password", "bigdata")
        .option("driver", "oracle.jdbc.driver.OracleDriver")
        .load()
      df.schema.foreach(s => println(s.name, s.metadata))

      // Column name -> comment text. Columns that are absent from this map get no comment.
      val commentMap = Map("ID" -> "ID", "NAME" -> "名字")

      // Attach a comment only where one is defined; commentMap(s.name) would throw
      // NoSuchElementException for any column that has no entry in the map.
      val schema = df.schema.map { s =>
        commentMap.get(s.name).fold(s)(s.withComment)
      }

      // Build a new DataFrame from the comment-annotated schema.
      val new_df = spark.createDataFrame(df.rdd, StructType(schema)).repartition(160)
      new_df.schema.foreach(s => println(s.name, s.metadata))

      // Save to Hive.
      spark.sql("use test")
      new_df.write.mode("overwrite").saveAsTable("ORA_TEST")
    } finally {
      spark.stop()
    }
  }
}