Distinct This article contains example code snippets that show how to use the Distinct component.
On this page Filtering Duplicates with DistinctColumn
# This example demonstrates using the Distinct
component to eliminate duplicate records in a data flow. The DistinctColumn
attribute specifies which properties to use for identifying duplicates.
public class MyRow
{
[DistinctColumn]
public int Id { get ; set ; }
[DistinctColumn]
public string Value { get ; set ; }
public string TestId { get ; set ; }
}
var source = new MemorySource < MyRow >();
source . DataAsList . Add ( new MyRow () { Id = 1 , Value = "A" , TestId = "Test1" });
source . DataAsList . Add ( new MyRow () { Id = 2 , Value = "A" , TestId = "Test2" });
source . DataAsList . Add ( new MyRow () { Id = 2 , Value = "B" , TestId = "Test3" });
source . DataAsList . Add ( new MyRow () { Id = 1 , Value = "A" , TestId = "Test4" });
source . DataAsList . Add ( new MyRow () { Id = 2 , Value = "A" , TestId = "Test5" });
source . DataAsList . Add ( new MyRow () { Id = 3 , Value = "B" , TestId = "Test6" });
var trans = new Distinct < MyRow >();
var dest = new MemoryDestination < MyRow >();
source . LinkTo ( trans );
trans . LinkTo ( dest );
Network . Execute ( source );
foreach ( var row in dest . Data )
Console . WriteLine ( $"Id:{row.Id} Value:{row.Value} TestId:{row.TestId}" );
//Output
/*
Id:1 Value:A TestId:Test1
Id:2 Value:A TestId:Test2
Id:2 Value:B TestId:Test3
Id:3 Value:B TestId:Test6
*/
Redirecting Duplicates to an Alternate Flow# This example illustrates how to use the LinkDuplicatesTo
method to send duplicates to a different data flow.
var source = new MemorySource < MyRow >();
source . DataAsList . Add ( new MyRow () { Id = 1 , Value = "A" , TestId = "Test1" });
source . DataAsList . Add ( new MyRow () { Id = 2 , Value = "A" , TestId = "Test2" });
source . DataAsList . Add ( new MyRow () { Id = 2 , Value = "B" , TestId = "Test3" });
source . DataAsList . Add ( new MyRow () { Id = 1 , Value = "A" , TestId = "Test4" });
source . DataAsList . Add ( new MyRow () { Id = 2 , Value = "A" , TestId = "Test5" });
source . DataAsList . Add ( new MyRow () { Id = 3 , Value = "B" , TestId = "Test6" });
var trans = new Distinct < MyRow >();
var dest = new MemoryDestination < MyRow >();
var duplicateDest = new MemoryDestination < MyRow >();
source . LinkTo ( trans );
trans . LinkTo ( dest );
trans . LinkDuplicatesTo ( duplicateDest );
Network . Execute ( source );
foreach ( var row in dest . Data )
Console . WriteLine ( $"Id:{row.Id} Value:{row.Value} TestId:{row.TestId}" );
foreach ( var row in duplicateDest . Data )
Console . WriteLine ( $"Duplicate - Id:{row.Id} Value:{row.Value} TestId:{row.TestId}" );
//Output
/*
Id:1 Value:A TestId:Test1
Id:2 Value:A TestId:Test2
Id:2 Value:B TestId:Test3
Id:3 Value:B TestId:Test6
Duplicate - Id:1 Value:A TestId:Test4
Duplicate - Id:2 Value:A TestId:Test5
*/
Using Distinct with Dynamic Objects# Like all ETLBox data flow components, the Distinct component also facilitates the use of dynamic ExpandoObject
.
var source = new MemorySource ();
source . DataAsList . Add ( CreateDynamicRow ( distinctCol1 : 1 , distinctCol2 : "A" , otherValue : "1" ));
source . DataAsList . Add ( CreateDynamicRow ( distinctCol1 : 2 , distinctCol2 : "B" , otherValue : "5" ));
source . DataAsList . Add ( CreateDynamicRow ( distinctCol1 : 1 , distinctCol2 : "C" , otherValue : "2" ));
source . DataAsList . Add ( CreateDynamicRow ( distinctCol1 : 1 , distinctCol2 : "A" , otherValue : "3" ));
source . DataAsList . Add ( CreateDynamicRow ( distinctCol1 : 1 , distinctCol2 : "C" , otherValue : "4" ));
source . DataAsList . Add ( CreateDynamicRow ( distinctCol1 : 2 , distinctCol2 : "B" , otherValue : "5" ));
source . DataAsList . Add ( CreateDynamicRow ( distinctCol1 : 2 , distinctCol2 : "B" , otherValue : "6" ));
var trans = new Distinct ();
trans . DistinctColumns = new DistinctColumn []{
new DistinctColumn () { DistinctPropertyName = "DistinctCol1" },
new DistinctColumn () { DistinctPropertyName = "DistinctCol2" }
};
var dest = new MemoryDestination ();
var destDuplicates1 = new MemoryDestination ();
var destDuplicates2 = new MemoryDestination ();
source . LinkTo ( trans );
trans . LinkTo ( dest );
trans . LinkDuplicatesTo ( destDuplicates1 , row => ( row as dynamic ). DistinctCol1 == 1 );
trans . LinkDuplicatesTo ( destDuplicates2 , row => ( row as dynamic ). DistinctCol1 == 2 );
Network . Execute ( source );
foreach ( dynamic row in dest . Data )
Console . WriteLine ( $"DistinctCol1:{row.DistinctCol1} DistinctCol2:{row.DistinctCol2} OtherValue:{row.OtherValue}" );
foreach ( dynamic row in destDuplicates1 . Data )
Console . WriteLine ( $"Duplicate 1 - DistinctCol1:{row.DistinctCol1} DistinctCol2:{row.DistinctCol2} OtherValue:{row.OtherValue}" );
foreach ( dynamic row in destDuplicates2 . Data )
Console . WriteLine ( $"Duplicate 2 - DistinctCol1:{row.DistinctCol1} DistinctCol2:{row.DistinctCol2} OtherValue:{row.OtherValue}" );
//Output
/*
DistinctCol1:1 DistinctCol2:A OtherValue:1
DistinctCol1:2 DistinctCol2:B OtherValue:5
DistinctCol1:1 DistinctCol2:C OtherValue:2
Duplicate 1 - DistinctCol1:1 DistinctCol2:A OtherValue:3
Duplicate 1 - DistinctCol1:1 DistinctCol2:C OtherValue:4
Duplicate 2 - DistinctCol1:2 DistinctCol2:B OtherValue:5
Duplicate 2 - DistinctCol1:2 DistinctCol2:B OtherValue:6
*/
public ExpandoObject CreateDynamicRow ( int distinctCol1 , string distinctCol2 , string otherValue ) {
dynamic r = new ExpandoObject ();
r . DistinctCol1 = distinctCol1 ;
r . DistinctCol2 = distinctCol2 ;
r . OtherValue = otherValue ;
return r ;
}
Custom Unique Key Function# Use the GetUniqueKeyFunc
property to define a custom logic for creating a unique key for each row.
var source = new MemorySource < MyRow >();
source . DataAsList . Add ( new MyRow () { Id = 1 , Value = "A" , TestId = "Test1" });
source . DataAsList . Add ( new MyRow () { Id = 1 , Value = "B" , TestId = "Test2" });
source . DataAsList . Add ( new MyRow () { Id = 1 , Value = "A_dupe" , TestId = "Test3" });
source . DataAsList . Add ( new MyRow () { Id = 1 , Value = "b_dupe" , TestId = "Test4" });
source . DataAsList . Add ( new MyRow () { Id = 1 , Value = "c" , TestId = "Test5" });
source . DataAsList . Add ( new MyRow () { Id = 1 , Value = "a" , TestId = "Test6" });
var trans = new Distinct < MyRow >();
trans . GetUniqueKeyFunc = ( row ) => row . Value . Substring ( 0 , 1 ). ToLower ();
var dest = new MemoryDestination < MyRow >();
var destDuplicates = new MemoryDestination < MyRow >();
source . LinkTo ( trans );
trans . LinkTo ( dest );
trans . LinkDuplicatesTo ( destDuplicates );
Network . Execute ( source );
foreach ( var row in dest . Data )
Console . WriteLine ( $"Id:{row.Id} Value:{row.Value} TestId:{row.TestId}" );
foreach ( var row in destDuplicates . Data )
Console . WriteLine ( $"Duplicate - Id:{row.Id} Value:{row.Value} TestId:{row.TestId}" );
//Output
/*
Id:1 Value:A TestId:Test1
Id:1 Value:B TestId:Test2
Id:1 Value:c TestId:Test5
Duplicate - Id:1 Value:A_dupe TestId:Test3
Duplicate - Id:1 Value:b_dupe TestId:Test4
Duplicate - Id:1 Value:a TestId:Test6
*/