
A problem writing from Spark to Oracle

[Unresolved question]

 1 package com.pingan.pilot.bridge.spark_etl
 2
 3 /**
 4  *
 5  */
 6 import com.pingan.pilot.bridge.common.{OperationOracleUtil, Logging, Utils}
 7 import org.apache.spark.sql.{DataFrame, SparkSession}
 8 import org.apache.spark.sql.functions._
 9 import java.io.File
10 import java.sql.{DriverManager, Connection}
11 import org.apache.spark.sql.types.StructType
12 import java.sql.{Connection, PreparedStatement, Statement}
13 import com.pingan.pilot.bridge.db.{DataSourceFactory, OracleProperties, SingleThreadConnectionHolder, TransactionManager}
14 //////////
15 object common_etl extends Logging {
16   def main (args: Array[String]) {
17     val spark = OperationOracleUtil.get_sparksession()
18     val hiveDF = spark.sql("select * from test limit 3")
19     logInfo("hiveDF = "+hiveDF.show())
20     hiveDF.foreachPartition(data_insert _ )
21
22     def data_insert(iterator: Iterator[org.apache.spark.sql.Row]): Unit = {
23       val conn = OperationOracleUtil.get_connection()
24       logInfo("conn is Ok")
25       var ps:PreparedStatement = null
26       ps = OperationOracleUtil.getInsertStatement(conn, "zqh_etl", hiveDF.schema)
27       logInfo("InsertStatement is ok ")
28       try {
29         iterator.foreach(data => {
30           for ( i <- 0 until data.length )
31             ps.setObject(i+1, data(i))
32           ps.addBatch()
33           logInfo("addBatch is ok ")
34           ps.executeUpdate()
35         })
36       } catch {
37         case e: Exception => println("oracle Exception")
38       } finally {
39         if (ps != null) { ps.close() }
40         if (conn != null) { conn.close() }
41       }
42     }
43   }
44 }

The code is above. I want to use Spark to write data from Hive to Oracle. I understood that foreachPartition can improve write throughput, but I get the error below and I can't see where the logic goes wrong. I'm a beginner just getting started, so I'd appreciate any pointers.

Error message:

Caused by: java.lang.NullPointerException
    at org.apache.spark.sql.Dataset.schema(Dataset.scala:410)
    at com.pingan.pilot.bridge.spark_etl.common_etl$.com$pingan$pilot$bridge$spark_etl$common_etl$$data_insert$1(common_etl.scala:26)
    at com.pingan.pilot.bridge.spark_etl.common_etl$$anonfun$main$2.apply(common_etl.scala:20)
    at com.pingan.pilot.bridge.spark_etl.common_etl$$anonfun$main$2.apply(common_etl.scala:20)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:925)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:925)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:99)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
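
The top frame of the trace points at the cause: Dataset.schema is being evaluated inside the partition closure (common_etl.scala:26, reached from the foreachPartition call at line 20). hiveDF is a driver-side handle; its internals are transient and do not survive serialization to the executors, so hiveDF.schema dereferences a null queryExecution on the worker. The usual fix is to capture the schema into a serializable StructType on the driver, before the closure is built. A minimal sketch of that change, reusing the question's OperationOracleUtil helpers and the zqh_etl table as-is (this replaces the body of main after spark is created; it assumes those helpers behave as shown in the question):

    import java.sql.PreparedStatement
    import org.apache.spark.sql.Row
    import org.apache.spark.sql.types.StructType

    val hiveDF = spark.sql("select * from test limit 3")

    // Capture the schema on the driver: a StructType is a plain serializable
    // value and travels safely inside the closure; the Dataset does not.
    val schema: StructType = hiveDF.schema

    def data_insert(rows: Iterator[Row]): Unit = {
      val conn = OperationOracleUtil.get_connection()   // assumed helper from the question
      val ps: PreparedStatement =
        OperationOracleUtil.getInsertStatement(conn, "zqh_etl", schema)
      try {
        rows.foreach { row =>
          for (i <- 0 until row.length)
            ps.setObject(i + 1, row.get(i).asInstanceOf[AnyRef])
          ps.addBatch()
        }
        // Flush the whole batch once per partition; this is where the
        // foreachPartition write speedup actually comes from.
        ps.executeBatch()
      } finally {
        if (ps != null) ps.close()
        if (conn != null) conn.close()
      }
    }

    hiveDF.foreachPartition(data_insert _)

Two smaller points, separate from the crash: mixing addBatch() with executeUpdate() defeats the batching, since depending on the driver each row is either sent individually or the call is rejected, and the accumulated batch is never flushed; calling executeBatch() once per partition (or every few thousand rows) is what actually reduces round trips to Oracle. Also, logInfo("hiveDF = " + hiveDF.show()) will log "hiveDF = ()" because show() prints the rows to stdout and returns Unit.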
hongma | Beginner Level 1 | Points: 179
Asked: 2018-02-23 11:05