mongodb - Merge duplicates and remove the oldest -
I have a collection that contains duplicate documents. For example:
first document:
{ "_id" : ObjectId("56f3d7cc1de31cb20c08ae6b"), "addeddate" : ISODate("2016-05-01T00:00:00.000Z"), "place": "thisplace", "presentindb" : [ { "indb" : ISODate("2016-05-01T00:00:00.000Z") } ], "checked" : [], "link": "http://www.mylink.com/first/84358" }
second document:
{ "_id" : ObjectId("577740526c1e542904725238"), "addeddate" : ISODate("2016-05-02T00:00:00.000Z"), "place": "thisplace", "presentindb" : [ { "indb" : ISODate("2016-05-02T00:00:00.000Z") }, { "indb" : ISODate("2016-05-03T00:00:00.000Z") } ], "checked" : [ { "done" : ISODate("2016-05-02T00:00:00.000Z") } ], "link": "http://www.mylink.com/second/84358" }
The `link` field contains the same sequence of numbers in both documents: `84358`.
So I would like to achieve the following steps:
- Loop over each document in the collection.
- Match the number sequence in each document's `link` field (i.e. `84358` above). If several documents in the collection have that sequence in the `link` field, and the `place` field also matches in both documents:
  - Merge the `presentindb` and `checked` fields, by adding the array values from the newest document (by date in the `addeddate` field) to the oldest document.
  - Remove the newest document.
How can I achieve this with a query?
The MongoDB 3.3.6 release introduced the `$split` operator for dealing with strings in the aggregation framework (see the Jira ticket). Before that release, you can only solve this with a map/reduce solution.
after mongodb 3.3.6 release: aggregation framework solution
// Merges documents that share both the trailing number sequence of `link` and the same
// `place`, keeping the identity of the oldest document in each group.
// Requires MongoDB >= 3.3.6 because of the $split operator.
// Operator and option names are case-sensitive: $arrayElemAt, preserveNullAndEmptyArrays
// and $addToSet must be spelled exactly as below.
db.duplicatedcollection.aggregate([
  // Keep the original fields and derive the sequence number, i.e. the last
  // "/"-separated segment of `link`.
  {
    $project: {
      _id: 1,
      addeddate: 1,
      place: 1,
      presentindb: 1,
      checked: 1,
      link: 1,
      sequencenumber: { $arrayElemAt: [{ $split: ["$link", "/"] }, -1] },
    },
  },
  // Oldest first, so that every $first below picks values from the oldest document.
  { $sort: { addeddate: 1 } },
  // One group per (sequence number, place); $push collects each document's arrays,
  // producing arrays of arrays.
  {
    $group: {
      _id: { sequencenumber: "$sequencenumber", place: "$place" },
      id: { $first: "$_id" },
      addeddate: { $first: "$addeddate" },
      place: { $first: "$place" },
      presentindb: { $push: "$presentindb" },
      checked: { $push: "$checked" },
      link: { $first: "$link" },
    },
  },
  // Flatten the arrays of arrays: the first $unwind opens the outer array, the second
  // opens each inner array while keeping rows whose inner array is empty.
  { $unwind: "$presentindb" },
  { $unwind: { path: "$presentindb", preserveNullAndEmptyArrays: true } },
  { $unwind: "$checked" },
  { $unwind: { path: "$checked", preserveNullAndEmptyArrays: true } },
  // Rebuild one document per group under the oldest document's _id; $addToSet removes
  // the repeated values created by unwinding two arrays.
  {
    $group: {
      _id: "$id",
      addeddate: { $first: "$addeddate" },
      place: { $first: "$place" },
      presentindb: { $addToSet: "$presentindb" },
      checked: { $addToSet: "$checked" },
      link: { $first: "$link" },
    },
  },
  // Write the merged documents back over the source collection.
  { $out: "duplicatedcollection" },
]);
before mongodb 3.3.6 release: map/reduce solution
map function:
// Map step: emits the current document (`this`) under a compound key made of its
// `place` and the last "/"-separated segment of its `link`.
// Written in ES5 (`var`, function expression) because it runs in the server-side
// JavaScript engine and needs `this` bound to the document.
var mapfunction = function() {
  var groupKey = {
    place: this.place,
    sequencenumber: this.link.split("/").pop(),
  };
  emit(groupKey, this);
};
reduce function:
/**
 * Reduce step: merges every document emitted under one { place, sequencenumber } key
 * into a single document.
 *
 * The result keeps the `_id`, `addeddate` and `link` of the oldest document (smallest
 * `addeddate`), so that newer duplicates are the ones merged away, and takes `place`
 * from the key. `presentindb` and `checked` are the union of all input arrays,
 * de-duplicated by the millisecond timestamp of their `indb` / `done` dates.
 *
 * The output has the same shape as the emitted values, so it can be fed back into
 * another reduce call for the same key.
 *
 * @param {{place: *, sequencenumber: string}} key - key emitted by the map function.
 * @param {Array<Object>} values - documents (or earlier reduce results) for that key.
 * @returns {Object} the merged document.
 */
var reducefunction = function(key, values) {
  // Appends to `target` each element of `source` whose date in `dateField` has not been
  // merged yet. `seenMillis` records the timestamps already taken. Defined inside the
  // reduce function because only the function body is shipped to the server.
  var mergeUniqueByDate = function(target, source, dateField, seenMillis) {
    (source || []).forEach(function(elem) {
      var millis = elem[dateField].getTime();
      if (seenMillis[millis] !== true) {
        seenMillis[millis] = true;
        target.push(elem);
      }
    });
  };

  var reduceddoc = {};
  reduceddoc._id = values[0]._id;
  reduceddoc.addeddate = values[0].addeddate;
  reduceddoc.place = key.place;
  reduceddoc.link = values[0].link;
  reduceddoc.presentindb = [];
  reduceddoc.checked = [];

  var presentindbSeen = {};
  var checkedSeen = {};

  values.forEach(function(doc) {
    // Adopt the identity of the oldest document seen so far.
    if (reduceddoc.addeddate > doc.addeddate) {
      reduceddoc._id = doc._id;
      reduceddoc.addeddate = doc.addeddate;
      reduceddoc.link = doc.link;
    }
    mergeUniqueByDate(reduceddoc.presentindb, doc.presentindb, "indb", presentindbSeen);
    mergeUniqueByDate(reduceddoc.checked, doc.checked, "done", checkedSeen);
  });

  return reduceddoc;
};
map/reduce:
// Run the map/reduce over the collection. The shell method is `mapReduce`
// (case-sensitive). Passing a collection name as "out" replaces that collection with
// the results, each stored as { _id: <key>, value: <document> }.
db.duplicatedcollection.mapReduce(
  mapfunction,
  reducefunction,
  { "out": "duplicatedcollection" }
);
Unwrap the `value` field of the documents returned by map/reduce:
// Unwrap the map/reduce output: for every result document that has a `value` field,
// insert that value as a document of its own and delete the wrapper.
// The cursor method is `forEach` (case-sensitive).
db.duplicatedcollection.find({ value: { $exists: true } }).forEach(function(doc) {
  db.duplicatedcollection.insert(doc.value);
  db.duplicatedcollection.remove({ _id: doc._id });
});
Comments
Post a Comment