Spark RDD API 基本操作

云计算 waitig 478℃ 百度已收录 0评论

object RddTest {

  /**
   * Entry point: creates a local SparkContext, runs the currently enabled
   * demo and always stops the context afterwards.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("rdd test")
    val sc = new SparkContext(conf)
    try {
      // Uncomment exactly one of the demos below to run it.
      //mapTest(sc)
      //val file= sc.textFile("/user-logs-large.txt")
      //file.flatMap(_.split("\\s")).foreach(println)
      //distinctTest(sc)
      // filterTest(sc)
      // keyByTest(sc)
      //sortByTest(sc)
      // rePartitionTest(sc)
      groupBy(sc)
    } finally {
      // Stop the context even when the demo throws, so it is not left running.
      sc.stop()
    }
  }

  //flatMap map mapPartitions
  /**
   * Builds three transformations over the log file: `map`, `mapPartitions`
   * and `flatMap`. Only transformations are defined here; the `take` calls
   * that would trigger a job are left commented out.
   *
   * NOTE(review): assumes each line is whitespace-separated with the action
   * in the second field and a third field after it, as the other demos in
   * this object read it — confirm against the data file.
   *
   * @param sc the SparkContext used to read the input file
   */
  def mapTest(sc: SparkContext): Unit = {
    val file = sc.textFile("/user-logs-large.txt")

    // map: one (field0, field1) pair per input line.
    val mapResult = file.map { line =>
      val info = line.split("\\s")
      (info(0), info(1))
    }
    //mapResult.take(10).foreach(println)

    // mapPartitions: same pairs, produced per partition iterator.
    // Each line gets its own local array instead of a shared mutable var.
    val mapPartitionResult = file.mapPartitions { lines =>
      lines.map { line =>
        val info = line.split("\\s")
        (info(0), info(1))
      }
    }
    //mapPartitionResult.take(10).foreach(println)

    // flatMap: turn each new_tweet record into two login records.
    // The action is matched on the second field (the first field is used as
    // the record's leading value in the generated login lines).
    val flatMapResult = file.flatMap { line =>
      line.split("\\s") match {
        case Array(first, "new_tweet", third, _*) => Seq.fill(2)(s"$first login $third")
        case _ => Seq(line)
      }
    }
    //flatMapResult.take(10).foreach(println)
  }

  //distinct排重
  /**
   * Prints every distinct value found in the first tab-separated field of
   * the log file, read with a minimum of 3 partitions.
   *
   * @param sc the SparkContext used to read the input file
   */
  def distinctTest(sc: SparkContext) = {
    val lines = sc.textFile("/user-logs-large.txt", 3)
    val firstFields = lines.map { line => line.split("\\t")(0) }
    val uniqueFirstFields = firstFields.distinct()
    uniqueFirstFields.foreach(value => println(value))
  }

  //过滤
  /**
   * Keeps only the lines whose second whitespace-separated field equals
   * "login", prints each of them and then prints how many there are.
   *
   * @param sc the SparkContext used to read the input file
   */
  def filterTest(sc: SparkContext) = {
    val lines = sc.textFile("/user-logs-large.txt", 3)
    val loginLines = lines.filter { line =>
      val fields = line.split("\\s")
      fields(1) == "login"
    }
    loginLines.foreach(line => println(line))
    val total = loginLines.count()
    println(total)
  }

  //keyBy 结果的key值是自定义的,v是原数据x
  /**
   * Demonstrates `keyBy`: each line is paired with a key built from its
   * first two whitespace-separated fields joined by "-----"; the value of
   * each pair is the original line. Every pair is printed.
   *
   * @param sc the SparkContext used to read the input file
   */
  def keyByTest(sc: SparkContext) = {
    val lines = sc.textFile("/user-logs-large.txt", 3)
    val keyed = lines.keyBy { line =>
      val fields = line.split("\\s")
      val first = fields(0)
      val second = fields(1)
      s"${first}-----${second}"
    }
    keyed.foreach(pair => println(pair))
  }

  //sortBy排序
  /**
   * Sorts the lines of the input file by the integer value of their second
   * whitespace-separated field (ascending) and prints them in that order.
   *
   * @param sc the SparkContext used to read the input file
   */
  def sortByTest(sc: SparkContext): Unit = {
    val file = sc.textFile("/spark/part-00001")
    val sortByResult = file.sortBy(x => x.split("\\s+")(1).toInt)
    // RDD.foreach runs per partition on the executors, so printing from it
    // does not guarantee the sorted order; collect to the driver first.
    sortByResult.collect().foreach(println)
  }

  //topN
  /**
   * Demonstrates `takeOrdered` and `top` on a small in-memory list split
   * into 2 partitions: prints the 3 smallest values, then the 3 largest.
   *
   * @param sc the SparkContext used to parallelize the list
   */
  def topNTest(sc: SparkContext) = {
    val numbers = sc.parallelize(List(1, 2, 5, 11, 545, 22, 12, 55), 2)

    // takeOrdered uses ascending order by default.
    numbers.takeOrdered(3).foreach(n => println(n))

    // top uses descending order by default.
    numbers.top(3).foreach(n => println(n))
  }

  //重新分区
  /**
   * Prints the number of records per partition for the original file RDD,
   * for the same data repartitioned into 5 partitions, and for the same
   * data coalesced into 3 partitions.
   *
   * @param sc the SparkContext used to read the input file
   */
  def rePartitionTest(sc: SparkContext) = {
    val source = sc.textFile("/user-logs-large.txt")
    val repartitioned = source.repartition(5)

    // Partition sizes of the RDD as it was read.
    source.foreachPartition { part =>
      println(s"fileRdd分区,该分区数据:${part.size} ")
    }

    // repartition: wide dependency (shuffle).
    repartitioned.foreachPartition { part =>
      val sum = part.foldLeft(0)((acc, _) => acc + 1)
      println(s"resultRdd分区,该分区数据:$sum ")
    }

    // coalesce: narrow dependency.
    val coalesced = source.coalesce(3)
    coalesced.foreachPartition { part =>
      println(s"coalResultRdd:${part.size}")
    }
  }

  //groupBy
  /**
   * Groups the log lines by their first whitespace-separated field and
   * prints: the number of groups in each partition, the number of lines in
   * each group, and, per group, how many lines have "login" as their
   * second field.
   *
   * @param sc the SparkContext used to read the input file
   */
  def groupBy(sc: SparkContext) = {
    val lines = sc.textFile("/user-logs-large.txt")
    val grouped = lines.groupBy(line => line.split("\\s")(0))

    // Number of grouped records held by each partition.
    grouped.foreachPartition { part =>
      println(s"GroupByRdd分区,该分区数据:${part.size} ")
    }

    // Size of the value collection for every key.
    grouped.foreach { case (key, members) =>
      println(s"GroupByRdd的一条记录,key${key},value上集合的记录是:${members.size}")
    }

    // Count the login records of each key.
    grouped.foreach { case (key, members) =>
      val sum = members.count(line => line.split("\\s")(1) == "login")
      println(s"user:${key},logintimes:$sum")
    }
  }
  /**
   * Demonstrates `reduce`, `fold` and `aggregate` on a small in-memory list
   * split into 3 partitions: the first two compute the sum, the third joins
   * the elements into a comma-separated string. All three results are
   * printed.
   *
   * @param sc the SparkContext used to parallelize the list
   */
  def aggSumTest(sc: SparkContext): Unit = {
    val list = List(3, 5, 8, 9, 12, 55)
    val rdd = sc.parallelize(list, 3)

    // Sum via reduce.
    val reduceSum = rdd.reduce(_ + _)
    // Sum via fold with zero value 0.
    val foldRedult = rdd.fold(0)(_ + _)
    // Join the elements into one string via aggregate.
    val aggregateResult = rdd.aggregate("")(
      // Within a partition: append the element, no leading comma on the first.
      (acc, v) => if (acc.isEmpty) v.toString else s"$acc,$v",
      // Across partitions: skip an empty side so an empty partial result
      // does not leave a stray leading or trailing comma.
      (left, right) =>
        if (left.isEmpty) right
        else if (right.isEmpty) left
        else s"$left,$right"
    )

    println(s"reduceResult: $reduceSum")
    println(s"foldRedult: $foldRedult")
    println(s"aggregateResult: $aggregateResult")
  }

  //persist
  /**
   * Demonstrates persisting an RDD that is used by more than one job: the
   * file RDD is kept in memory while two distinct-counts are computed over
   * it, and released afterwards.
   *
   * @param sc the SparkContext used to read the input file
   * @return the number of distinct values in the third whitespace-separated
   *         field (the result of the last count)
   */
  def persistTest(sc: SparkContext): Long = {
    val file = sc.textFile("/user-logs-large.txt")
    // cache() is persist(StorageLevel.MEMORY_ONLY); one call is enough.
    file.persist(StorageLevel.MEMORY_ONLY)
    try {
      // Count the distinct values of the first field (users).
      file.map(x => x.split("\\s")(0)).distinct().count
      // Count the distinct values of the third field (ips).
      file.map(x => x.split("\\s")(2)).distinct().count
    } finally {
      // Release the cached blocks once both jobs are done.
      file.unpersist()
    }
  }

本文由【waitig】发表在等英博客
本文固定链接:Spark RDD API 基本操作
欢迎关注本站官方公众号,每日都有干货分享!
等英博客官方公众号
点赞 (0)分享 (0)