Using MongoSpark, running the same code on two datasets of different sizes causes one of them to throw the E11000 duplicate key error.

Before we proceed, here is the code:

```scala
object ScrapeHubCompanyImporter {

  def importData(path: String, companyMongoUrl: String): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .config("spark.mongodb.input.uri", companyMongoUrl)
      .config("spark.mongodb.output.uri", companyMongoUrl)
      .config("spark.mongodb.input.partitionerOptions.partitionKey", "profileUrl")
      .getOrCreate()

    import spark.implicits._

    val websiteToDomainTransformer = udf((website: String) => {
      val tldExtract = SplitHost.fromURL(website)
      if (tldExtract.domain == "") {
        null
      } else {
        tldExtract.domain + "." + tldExtract.tld
      }
    })

    val jsonDF = spark
      .read
      .json(path)
      .filter { row => row.getAs[String]("canonical_url") != null }
      .dropDuplicates(Seq("canonical_url"))
      .select(
        toHttpsUdf($"canonical_url").as("profileUrl"),
        $"city",
        $"country",
        $"founded",
        $"hq".as("headquartes"),
        $"industry",
        $"company_id".as("companyId"),
        $"name",
        $"postal",
        $"size",
        $"specialties",
        $"state",
        $"street_1",
        $"street_2",
        $"type",
        $"website"
      )
      .filter { row => row.getAs[String]("website") != null }
      .withColumn("domain", websiteToDomainTransformer($"website"))
      .filter(row => row.getAs[String]("domain") != null)
      .as[ScrapeHubCompanyDataRep]

    val jsonColsSet = jsonDF.columns.toSet

    val mongoData = MongoSpark
      .load[LinkedinCompanyRep](spark)
      .withColumn("companyUrl", toHttpsUdf($"companyUrl"))
      .as[CompanyRep]

    val mongoColsSet = mongoData.columns.toSet

    val union = jsonDF.joinWith(
        mongoData,
        jsonDF("companyUrl") === mongoData("companyUrl"),
        joinType = "left")
      .map { t =>
        val scrapeHub = t._1
        val liCompanyRep = if (t._2 != null) {
          t._2
        } else {
          CompanyRep(domain = scrapeHub.domain)
        }

        CompanyRep(
          _id = pickValue(liCompanyRep._id, None),
          city = pickValue(scrapeHub.city, liCompanyRep.city),
          country = pickValue(scrapeHub.country, liCompanyRep.country),
          postal = pickValue(scrapeHub.postal, liCompanyRep.postal),
          domain = scrapeHub.domain,
          founded = pickValue(scrapeHub.founded, liCompanyRep.founded),
          headquartes = pickValue(scrapeHub.headquartes, liCompanyRep.headquartes),
          headquarters = liCompanyRep.headquarters,
          industry = pickValue(scrapeHub.industry, liCompanyRep.industry),
          linkedinId = pickValue(scrapeHub.companyId, liCompanyRep.companyId),
          companyUrl = Option(scrapeHub.companyUrl),
          name = pickValue(scrapeHub.name, liCompanyRep.name),
          size = pickValue(scrapeHub.size, liCompanyRep.size),
          specialties = pickValue(scrapeHub.specialties, liCompanyRep.specialties),
          street_1 = pickValue(scrapeHub.street_1, liCompanyRep.street_1),
          street_2 = pickValue(scrapeHub.street_2, liCompanyRep.street_2),
          state = pickValue(scrapeHub.state, liCompanyRep.state),
          `type` = pickValue(scrapeHub.`type`, liCompanyRep.`type`),
          website = pickValue(scrapeHub.website, liCompanyRep.website),
          updatedDate = None,
          scraped = Some(true)
        )
      }

    val idToMongoId = udf { st: String =>
      if (st != null) {
        ObjectId(st)
      } else {
        null
      }
    }

    val saveReady = union
      .map { rep =>
        rep.copy(
          updatedDate = Some(new Timestamp(System.currentTimeMillis)),
          scraped = Some(true),
          headquarters = generateCompanyHeadquarters(rep)
        )
      }
      .dropDuplicates(Seq("companyUrl"))

    MongoSpark.save(
      saveReady.withColumn("_id", idToMongoId($"_id")),
      WriteConfig(Map(
        "uri" -> companyMongoUrl
      )))
  }

  def generateCompanyHeadquarters(companyRep: CompanyRep): Option[CompanyHeadquarters] = {
    val hq = CompanyHeadquarters(
      country = companyRep.country,
      geographicArea = companyRep.state,
      city = companyRep.city,
      postalCode = companyRep.postal,
      line1 = companyRep.street_1,
      line2 = companyRep.street_2
    )

    CompanyHeadquarters
      .unapply(hq)
      .get
      .productIterator.toSeq.exists {
        case a: Option[_] => a.isDefined
        case _            => false
      } match {
        case true  => Some(hq)
        case false => None
      }
  }

  def pickValue(left: Option[String], right: Option[String]): Option[String] = {
    def _noneIfNull(opt: Option[String]): Option[String] = {
      if (opt != null) opt else None
    }

    val lOpt = _noneIfNull(left)
    val rOpt = _noneIfNull(right)

    lOpt match {
      case Some(l) => Option(l)
      case None =>
        rOpt match {
          case Some(r) => Option(r)
          case None    => None
        }
    }
  }
}
```
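One detail I keep coming back to: dropDuplicates compares exact values, so two companyUrl strings that differ only in case or a trailing slash survive the dedup step even though they collide on Mongo's unique index. Below is a minimal sketch of the kind of normalization I mean; normalizeCompanyUrl is a hypothetical helper and not part of the job above, and it assumes spark.implicits._ is in scope:

```scala
import org.apache.spark.sql.functions.udf

// Hypothetical helper: canonicalize companyUrl before dropDuplicates so that
// values differing only in whitespace, case, or a trailing slash collapse
// into a single row.
val normalizeCompanyUrl = udf { url: String =>
  if (url == null) null
  else url.trim.toLowerCase.stripSuffix("/")
}

val normalized = saveReady
  .withColumn("companyUrl", normalizeCompanyUrl($"companyUrl"))
  .dropDuplicates(Seq("companyUrl"))
```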

The issue is around companyUrl, which is one of the unique keys in the collection (the other being the _id key). The problem is that Spark attempts to save tons of duplicates on a 700 GB dataset, but if I run a very small dataset locally, I'm never able to replicate the issue. I'm trying to understand what's going on, and how I can group all the existing companies on companyUrl and make sure that duplicates really are removed globally across the dataset.
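To check whether the dedup really is global, something like the following diagnostic could be run just before the save. This is a sketch for debugging only, not code that exists in the job:

```scala
// Count companyUrl values that still occur more than once after
// dropDuplicates. If this prints any rows, duplicates are surviving
// the dedup step and will collide with the unique index on save.
saveReady
  .groupBy("companyUrl")
  .count()
  .filter($"count" > 1)
  .orderBy($"count".desc)
  .show(50, truncate = false)
```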

EDIT: Here are some scenarios that arise:

- The company is in Mongo and the file that's read has updated data -> a duplicate key error can occur here.
- The company is not in Mongo but is in the file -> a duplicate key error can occur here as well.
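For the first scenario, my understanding (an assumption, since I haven't confirmed the connector's behavior) is that it matters which duplicate row survives: if the surviving row carries no _id, the save becomes an insert rather than a replace of the existing document, and that insert collides with the unique index on companyUrl. Here is a sketch that keeps one row per companyUrl while preferring rows that already have a Mongo _id:

```scala
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.row_number

// Sketch: keep exactly one row per companyUrl, ranking rows with a non-null
// _id first so that existing documents get replaced instead of re-inserted.
val byUrl = Window
  .partitionBy($"companyUrl")
  .orderBy($"_id".desc_nulls_last)

val onePerUrl = saveReady
  .withColumn("rn", row_number().over(byUrl))
  .filter($"rn" === 1)
  .drop("rn")
```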

EDIT 2: The duplicate key error occurs on the companyUrl field.