mongodb - Merge duplicates and remove the oldest -


I have a collection in which there are duplicate documents. For example:

first document:

{     "_id" : ObjectId("56f3d7cc1de31cb20c08ae6b"),     "addeddate" : ISODate("2016-05-01T00:00:00.000Z"),     "place": "thisplace",     "presentindb" : [          {             "indb" : ISODate("2016-05-01T00:00:00.000Z")         }      ],     "checked" : [],     "link": "http://www.mylink.com/first/84358" } 

second document:

{     "_id" : ObjectId("577740526c1e542904725238"),     "addeddate" : ISODate("2016-05-02T00:00:00.000Z"),     "place": "thisplace",     "presentindb" : [          {             "indb" : ISODate("2016-05-02T00:00:00.000Z")         },         {             "indb" : ISODate("2016-05-03T00:00:00.000Z")         }      ],     "checked" : [         {             "done" : ISODate("2016-05-02T00:00:00.000Z")         },     ],     "link": "http://www.mylink.com/second/84358" } 

The link field contains the same sequence of numbers in both documents: 84358.

So I would like to achieve these steps:

  1. Loop over each document in the collection.
  2. Match the number sequence in the link field of each document (i.e. 84358 above), and check whether several documents in the collection have the same sequence in their link field and the place field also matches in those documents.
  3. If so, merge the presentindb and checked fields by adding the array values from the newest document (by the date in the addeddate field) to the oldest document.
  4. Remove the newest document.

How can I achieve this with a query?

The MongoDB 3.3.6 release introduced the $split operator for dealing with strings in the aggregation framework (see the JIRA ticket). Before that release, this can only be solved with a map/reduce solution.

after mongodb 3.3.6 release: aggregation framework solution

// Merge duplicate documents with the aggregation framework (requires the
// $split operator). Documents are duplicates when they share both the last
// "/"-separated segment of `link` and the same `place`. The oldest document
// of each group (smallest `addeddate`) keeps its _id, addeddate, place and
// link; the `presentindb` and `checked` arrays of all group members are
// merged into it without repeated values.
//
// NOTE: aggregation operator and option names are case-sensitive
// ($arrayElemAt, $addToSet, preserveNullAndEmptyArrays).
db.duplicatedcollection.aggregate(
  [
    // 1. Compute the grouping key: the last segment of the link URL.
    {
      $project: {
        _id: 1,
        addeddate: 1,
        place: 1,
        presentindb: 1,
        checked: 1,
        link: 1,
        sequencenumber: { $arrayElemAt: [ { $split: ["$link", "/"] }, -1 ] }
      }
    },
    // 2. Oldest first, so that $first below picks the oldest document.
    {
      $sort: { addeddate: 1 }
    },
    // 3. Group the duplicates. $push collects each member's array as one
    //    element, so presentindb / checked become arrays of arrays here.
    {
      $group: {
        _id: {
          sequencenumber: "$sequencenumber",
          place: "$place"
        },
        id: { $first: "$_id" },
        addeddate: { $first: "$addeddate" },
        place: { $first: "$place" },
        presentindb: { $push: "$presentindb" },
        checked: { $push: "$checked" },
        link: { $first: "$link" }
      }
    },
    // 4. Flatten presentindb: the first $unwind splits the outer array (one
    //    entry per source document), the second splits the inner arrays.
    //    preserveNullAndEmptyArrays keeps rows whose inner array is empty.
    {
      $unwind: "$presentindb"
    },
    {
      $unwind: {
        path: "$presentindb",
        preserveNullAndEmptyArrays: true
      }
    },
    // 5. Same two-level flattening for checked.
    {
      $unwind: "$checked"
    },
    {
      $unwind: {
        path: "$checked",
        preserveNullAndEmptyArrays: true
      }
    },
    // 6. Regroup under the oldest document's original _id; $addToSet drops
    //    the repeated values produced by the unwinds above.
    {
      $group: {
        _id: "$id",
        addeddate: { $first: "$addeddate" },
        place: { $first: "$place" },
        presentindb: { $addToSet: "$presentindb" },
        checked: { $addToSet: "$checked" },
        link: { $first: "$link" }
      }
    },
    // 7. Write the merged documents back to the source collection
    //    (per the $out documentation this replaces its existing contents).
    {
      $out: "duplicatedcollection"
    }
  ]
);

before mongodb 3.3.6 release: map/reduce solution

map function:

/**
 * Map step of the duplicate merge.
 *
 * Runs with `this` set to the document being mapped and emits that whole
 * document under a compound key made of its `place` and the last
 * "/"-separated segment of its `link`, so documents sharing both end up in
 * the same reduce group. `emit` is expected to be supplied by the
 * map/reduce runtime.
 */
var mapfunction = function() {
    // Everything after the final "/" of the link is the sequence number.
    var lastSegment = this.link.split("/").pop();

    emit({ place: this.place, sequencenumber: lastSegment }, this);
};

reduce function:

/**
 * Reduce step of the duplicate merge.
 *
 * Collapses all documents emitted under one key into a single document:
 * `_id`, `addeddate` and `link` are taken from the document with the
 * greatest `addeddate`; `place` is copied from the first value (every value
 * of a group shares the same place, as it is part of the key);
 * `presentindb` and `checked` are the concatenation of the members' arrays
 * with entries de-duplicated by timestamp (`indb` / `done`).
 *
 * NOTE(review): this keeps the NEWEST document's identity, whereas the
 * aggregation-framework variant in this file keeps the OLDEST one — confirm
 * which of the two is intended.
 *
 * @param {Object} key    Group key ({ place, sequencenumber }); not read here.
 * @param {Array}  values Documents (or earlier reduce results) of the group.
 * @returns {Object} The merged document.
 */
var reducefunction = function(key, values) {
    // Appends to `target` every element whose date in `dateField` has not
    // been seen yet; `seen` maps epoch milliseconds -> true.
    function appendUnique(target, seen, elements, dateField) {
        // Tolerate documents where the array field is missing.
        (elements || []).forEach(function(elem) {
            var millis = elem[dateField].getTime();
            if (!seen[millis]) {
                seen[millis] = true;
                target.push(elem);
            }
        });
    }

    var reduceddoc = {};
    reduceddoc._id = values[0]._id;
    reduceddoc.addeddate = values[0].addeddate;
    // Keep `place` in the result so merged documents do not lose the field.
    reduceddoc.place = values[0].place;
    reduceddoc.link = values[0].link;
    reduceddoc.presentindb = [];
    reduceddoc.checked = [];

    var seenPresentMillis = {};
    var seenCheckedMillis = {};

    values.forEach(function(doc) {
        // Adopt the identity of the most recently added document.
        if (reduceddoc.addeddate < doc.addeddate) {
            reduceddoc._id = doc._id;
            reduceddoc.addeddate = doc.addeddate;
            reduceddoc.link = doc.link;
        }

        appendUnique(reduceddoc.presentindb, seenPresentMillis, doc.presentindb, "indb");
        appendUnique(reduceddoc.checked, seenCheckedMillis, doc.checked, "done");
    });

    return reduceddoc;
};

map/reduce:

// Run the map/reduce job over the collection and write the result back into
// the same collection. The shell method is case-sensitive: mapReduce.
// The stored results are later read through their `value` field, so they
// still need to be unwrapped after this call.
db.duplicatedcollection.mapReduce(
    mapfunction,
    reducefunction,
    {
        "out": "duplicatedcollection"
    }
);

Unwrap the value field of the documents returned by map/reduce:

   // Unwrap the map/reduce output: every document that has a `value` field
   // is replaced by the document stored in that field. The cursor method is
   // case-sensitive: forEach.
   db.duplicatedcollection.find(
       {
           value: {
               $exists: true
           }
       }
   ).forEach(function(doc) {
       // Insert the merged document first, then drop the wrapper, which is
       // identified by its own _id.
       db.duplicatedcollection.insert(doc.value);
       db.duplicatedcollection.remove({ _id: doc._id });
   });

Comments

Popular posts from this blog

jOOQ update returning clause with Oracle -

java - Warning equals/hashCode on @Data annotation lombok with inheritance -

java - BasicPathUsageException: Cannot join to attribute of basic type -