 1  package com.pingan.pilot.bridge.spark_etl
 2
 3  /**
 4   *
 5   */
 6  import com.pingan.pilot.bridge.common.{OperationOracleUtil, Logging, Utils}
 7  import org.apache.spark.sql.{DataFrame, SparkSession}
 8  import org.apache.spark.sql.functions._
 9  import java.io.File
10  import java.sql.{DriverManager, Connection}
11  import org.apache.spark.sql.types.StructType
12  import java.sql.{Connection, PreparedStatement, Statement}
13  import com.pingan.pilot.bridge.db.{DataSourceFactory, OracleProperties, SingleThreadConnectionHolder, TransactionManager}
14  //////////
15  object common_etl extends Logging {
16    def main (args: Array[String]) {
17      val spark = OperationOracleUtil.get_sparksession()
18      val hiveDF = spark.sql("select * from test limit 3")
19      logInfo("hiveDF = " + hiveDF.show())
20      hiveDF.foreachPartition(data_insert _)
21
22      def data_insert(iterator: Iterator[org.apache.spark.sql.Row]): Unit = {
23        val conn = OperationOracleUtil.get_connection()
24        logInfo("conn is Ok")
25        var ps: PreparedStatement = null
26        ps = OperationOracleUtil.getInsertStatement(conn, "zqh_etl", hiveDF.schema)
27        logInfo("InsertStatement is ok ")
28        try {
29          iterator.foreach(data => {
30            for (i <- 0 until data.length)
31              ps.setObject(i + 1, data(i))
32            ps.addBatch()
33            logInfo("addBatch is ok ")
34            ps.executeUpdate()
35          })
36        } catch {
37          case e: Exception => println("oracle Exception")
38        } finally {
39          if (ps != null) { ps.close() }
40          if (conn != null) { conn.close() }
41        }
42      }
43    }
44  }
The code is shown above. I want to use Spark to write data from a Hive table into Oracle, and I read that foreachPartition can improve write throughput, but the job fails with the error below and I can't see where the logic is wrong. I'm new to Spark, so any pointers would be appreciated.

Error message:
Caused by: java.lang.NullPointerException
	at org.apache.spark.sql.Dataset.schema(Dataset.scala:410)
	at com.pingan.pilot.bridge.spark_etl.common_etl$.com$pingan$pilot$bridge$spark_etl$common_etl$$data_insert$1(common_etl.scala:26)
	at com.pingan.pilot.bridge.spark_etl.common_etl$$anonfun$main$2.apply(common_etl.scala:20)
	at com.pingan.pilot.bridge.spark_etl.common_etl$$anonfun$main$2.apply(common_etl.scala:20)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:925)
	at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:925)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
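A note on the likely cause, for anyone who lands here with the same trace: the NullPointerException is thrown at Dataset.schema(Dataset.scala:410), reached from data_insert at common_etl.scala:26, i.e. the hiveDF.schema call. foreachPartition ships data_insert to the executors, so the closure captures hiveDF itself, and a Dataset cannot be used inside executor-side code (its driver-side internals are null after the closure is deserialized on the executor). Reading the schema once on the driver into a local StructType, which is plain serializable data, avoids capturing the Dataset. The sketch below shows that restructuring; it reuses OperationOracleUtil.get_sparksession, get_connection and getInsertStatement with the signatures implied by the listing above, and the batch size of 1000 is an arbitrary illustration. It also replaces the per-row executeUpdate() with periodic executeBatch(), since calling executeUpdate() after addBatch() inserts row by row and the accumulated batch is never actually executed.

package com.pingan.pilot.bridge.spark_etl

import java.sql.{Connection, PreparedStatement}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import com.pingan.pilot.bridge.common.OperationOracleUtil

object common_etl_fixed {
  def main(args: Array[String]): Unit = {
    val spark = OperationOracleUtil.get_sparksession()
    val hiveDF = spark.sql("select * from test limit 3")

    // Read the schema ONCE on the driver. StructType is ordinary
    // serializable data, so it is safe to capture in the closure;
    // the Dataset itself is not usable on executors.
    val schema: StructType = hiveDF.schema

    hiveDF.foreachPartition { (iterator: Iterator[Row]) =>
      // Executor side: one connection and one statement per partition.
      val conn: Connection = OperationOracleUtil.get_connection()
      val ps: PreparedStatement =
        OperationOracleUtil.getInsertStatement(conn, "zqh_etl", schema)
      try {
        var count = 0
        iterator.foreach { row =>
          for (i <- 0 until row.length)
            ps.setObject(i + 1, row(i))
          ps.addBatch()
          count += 1
          // Flush in chunks; 1000 is an arbitrary example size.
          if (count % 1000 == 0) ps.executeBatch()
        }
        ps.executeBatch() // flush the final partial batch
      } finally {
        if (ps != null) ps.close()
        if (conn != null) conn.close()
      }
    }
  }
}

The essential change is that nothing driver-only crosses into the closure: only the table name, the captured schema value, and the connection helpers are referenced there. If OperationOracleUtil.get_connection() itself touches driver-only state, it would need the same treatment.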