There are times when you need to test functionality that depends on an external service. In that case you either have to start the service yourself before running the tests (not recommended), or embed a service such as MiniDFSCluster, which provides a pseudo-cluster for as long as your test case is running.

SBT dependencies

// Test framework plus the Hadoop artifacts needed to run an in-process HDFS mini cluster.
libraryDependencies ++= {
  // All Hadoop artifacts are pinned to the same release.
  val hadoopVersion = "2.8.1"

  // Builds a Hadoop module available in both the compile and test configurations.
  def hadoop(artifact: String) =
    "org.apache.hadoop" % artifact % hadoopVersion % "compile,test"

  Seq(
    "org.scalatest" %% "scalatest" % "3.0.4",
    // The empty classifier selects the main jar; "tests" additionally pulls in the tests jar.
    hadoop("hadoop-hdfs") classifier "" classifier "tests",
    hadoop("hadoop-common") classifier "" classifier "tests",
    hadoop("hadoop-minicluster")
  )
}

An HDFSCluster trait that provides the functions responsible for starting HDFS (startHDFS), getting the NameNode URI (getNameNodeURI) and shutting HDFS down (shutdownHDFS)

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hdfs.MiniDFSCluster
import org.apache.hadoop.test.PathUtils

/**
 * Mix-in that manages an in-process HDFS mini cluster for tests.
 *
 * Call `startHDFS` before the tests run, use `getNameNodeURI` to obtain the
 * address to connect to, and call `shutdownHDFS` when the tests are done.
 */
trait HDFSCluster {

  // The running cluster; None until startHDFS has built one and again after shutdownHDFS.
  @transient private var hdfsCluster: Option[MiniDFSCluster] = None

  /**
   * Starts the mini cluster with its NameNode on port 9000 and waits until it is up.
   *
   * The cluster stores its data under a "miniHDFS" directory inside the test
   * directory resolved for this class, and is formatted on start-up.
   * Calling this while a cluster is already held is a no-op, so a previously
   * started cluster is never overwritten (and leaked).
   */
  def startHDFS: Unit = {
    if (hdfsCluster.isEmpty) {
      println("Starting HDFS Cluster...")
      val baseDir = new File(PathUtils.getTestDir(getClass), "miniHDFS")
      val conf = new Configuration()
      conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath)
      conf.setBoolean("dfs.webhdfs.enabled", true)
      val cluster = new MiniDFSCluster.Builder(conf)
        .nameNodePort(9000)
        .manageNameDfsDirs(true)
        .manageDataDfsDirs(true)
        .format(true)
        .build()
      // Remember the cluster before waiting, so shutdownHDFS can still clean up
      // if waiting for start-up fails.
      hdfsCluster = Some(cluster)
      cluster.waitClusterUp()
    }
  }

  /**
   * Returns the URI of the running NameNode, e.g. "hdfs://localhost:9000".
   *
   * @throws IllegalStateException if the cluster has not been started
   */
  def getNameNodeURI: String =
    hdfsCluster match {
      case Some(cluster) => "hdfs://localhost:" + cluster.getNameNodePort
      case None =>
        throw new IllegalStateException("HDFS cluster is not running; call startHDFS first")
    }

  /**
   * Shuts the mini cluster down if one is running; does nothing otherwise.
   *
   * The held reference is cleared even when shutdown throws, so the trait can
   * be started again afterwards.
   */
  def shutdownHDFS: Unit = {
    println("Shutting down HDFS Cluster...")
    try hdfsCluster.foreach(_.shutdown())
    finally hdfsCluster = None
  }
}

A generic HDFSHelper class which serializes data when writing it to HDFS and deserializes it when reading it back

import java.io.{ ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream }

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FSDataInputStream, FileSystem, Path }

import scala.util.{ Failure, Success, Try }

/**
 * Reads and writes single values of type `T` to HDFS using Java serialization.
 *
 * @param uri address of the file system, used as `fs.defaultFS`
 * @tparam T type of the stored value; it must be Java-serializable
 */
case class HDFSHelper[T](uri: String) extends Serializable {

  // Configuration and FileSystem are not java.io.Serializable, so they are kept
  // transient and rebuilt lazily from `uri`; otherwise serializing this helper fails.
  @transient lazy val conf: Configuration = {
    val configuration = new Configuration()
    configuration.set("fs.defaultFS", uri)
    configuration
  }

  @transient lazy val hdfs: FileSystem = FileSystem.get(conf)

  /**
   * Serializes `data` and writes it to `filePath`.
   *
   * Failing to create the file is best-effort: the stack trace is printed and
   * the method returns normally. Errors while serializing or writing propagate
   * to the caller. The output stream is always closed.
   *
   * @param data     value to store
   * @param filePath destination path of the file
   */
  def write(data: T, filePath: String): Unit = {
    Try(hdfs.create(new Path(filePath))) match {
      case Success(dataOutputStream) =>
        try dataOutputStream.write(serialize(data))
        finally dataOutputStream.close()
      case Failure(e) =>
        e.printStackTrace()
    }
  }

  /**
   * Reads the whole file at `filePath` and deserializes it into a `T`.
   *
   * The input stream is always closed. The file system itself is deliberately
   * left open: `FileSystem.get` hands out a cached instance, so closing it
   * here would break every later call made through the same instance.
   *
   * @param filePath path of the file to read
   * @return the deserialized value
   */
  def read(filePath: String): T = {
    val inputStream: FSDataInputStream = hdfs.open(new Path(filePath))
    try deserialize(IOUtils.toByteArray(inputStream))
    finally inputStream.close()
  }

  /**
   * Converts `data` to its Java-serialized byte representation.
   *
   * @throws Exception wrapping the underlying failure (kept as the cause)
   */
  def serialize(data: T): Array[Byte] = {
    try {
      val byteOut = new ByteArrayOutputStream()
      val objOut = new ObjectOutputStream(byteOut)
      // Closing the object stream flushes everything into byteOut.
      try objOut.writeObject(data)
      finally objOut.close()
      byteOut.toByteArray
    } catch {
      case ex: Exception => throw new Exception(ex.getMessage, ex)
    }
  }

  /**
   * Rebuilds a value from bytes produced by `serialize`.
   *
   * NOTE(review): Java deserialization of untrusted bytes is unsafe; only use
   * this on data written by a trusted producer. The cast to `T` is unchecked
   * because of type erasure, so a mismatching type surfaces at the use site.
   *
   * @throws Exception wrapping the underlying failure (kept as the cause)
   */
  def deserialize(bytes: Array[Byte]): T = {
    try {
      val objIn = new ObjectInputStream(new ByteArrayInputStream(bytes))
      try objIn.readObject().asInstanceOf[T]
      finally objIn.close()
    } catch {
      case ex: Exception => throw new Exception(ex.getMessage, ex)
    }
  }
}

And now a test case in which I am extending HDFSCluster trait for starting and stopping HDFS. An object of HDFSHelper of type Int is created for reading and writing data from HDFS directory.

hdfsHelper.write(data, dir)

writes the data object, an Int with the value 10, to the given HDFS path

hdfsHelper.read(dir)

reads an object of type Int back from the given HDFS path

import miniCluster.{HDFSCluster, HDFSHelper}
import org.scalatest.{BeforeAndAfterAll, WordSpec}

/**
 * Round-trip test against an embedded HDFS mini cluster: the cluster is
 * started once before all tests and shut down once after them.
 */
class miniDFSClusterSpec extends WordSpec with HDFSCluster with BeforeAndAfterAll {

  // Bring the cluster up before any test runs.
  override protected def beforeAll(): Unit = startHDFS

  // Tear the cluster down once every test has finished.
  override protected def afterAll(): Unit = shutdownHDFS

  "miniDFSClusterSpec" should {
    "write and read data from miniDFS cluster" in {
      val nameNodeUri = getNameNodeURI
      val target = s"$getNameNodeURI/user"
      val helper = new HDFSHelper[Int](nameNodeUri)
      val expected: Int = 10

      // Store the value, then load it back from the same path.
      helper.write(expected, target)
      val actual = helper.read(target)

      assert(expected == actual)
    }
  }
}