@@ -168,82 +168,72 @@ public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string righ
168
168
{
169
169
// First hash other dataframe on the rightJoinColumn
170
170
DataFrameColumn otherColumn = other . Columns [ rightJoinColumn ] ;
171
- Dictionary < TKey , ICollection < long > > multimap = otherColumn . GroupColumnValues < TKey > ( ) ;
171
+ Dictionary < TKey , ICollection < long > > multimap = otherColumn . GroupColumnValues < TKey > ( out HashSet < long > otherColumnNullIndices ) ;
172
172
173
173
// Go over the records in this dataframe and match with the dictionary
174
174
DataFrameColumn thisColumn = Columns [ leftJoinColumn ] ;
175
175
176
176
for ( long i = 0 ; i < thisColumn . Length ; i ++ )
177
177
{
178
178
var thisColumnValue = thisColumn [ i ] ;
179
- TKey thisColumnValueOrDefault = ( TKey ) ( thisColumnValue == null ? default ( TKey ) : thisColumnValue ) ;
180
- if ( multimap . TryGetValue ( thisColumnValueOrDefault , out ICollection < long > rowNumbers ) )
179
+ if ( thisColumnValue != null )
181
180
{
182
- foreach ( long row in rowNumbers )
181
+ if ( multimap . TryGetValue ( ( TKey ) thisColumnValue , out ICollection < long > rowNumbers ) )
183
182
{
184
- if ( thisColumnValue == null )
183
+ foreach ( long row in rowNumbers )
185
184
{
186
- // Match only with nulls in otherColumn
187
- if ( otherColumn [ row ] == null )
188
- {
189
- leftRowIndices . Append ( i ) ;
190
- rightRowIndices . Append ( row ) ;
191
- }
192
- }
193
- else
194
- {
195
- // Cannot match nulls in otherColumn
196
- if ( otherColumn [ row ] != null )
197
- {
198
- leftRowIndices . Append ( i ) ;
199
- rightRowIndices . Append ( row ) ;
200
- }
185
+ leftRowIndices . Append ( i ) ;
186
+ rightRowIndices . Append ( row ) ;
201
187
}
202
188
}
189
+ else
190
+ {
191
+ leftRowIndices . Append ( i ) ;
192
+ rightRowIndices . Append ( null ) ;
193
+ }
203
194
}
204
195
else
205
196
{
206
- leftRowIndices . Append ( i ) ;
207
- rightRowIndices . Append ( null ) ;
197
+ foreach ( long row in otherColumnNullIndices )
198
+ {
199
+ leftRowIndices . Append ( i ) ;
200
+ rightRowIndices . Append ( row ) ;
201
+ }
208
202
}
209
203
}
210
204
}
211
205
else if ( joinAlgorithm == JoinAlgorithm . Right )
212
206
{
213
207
DataFrameColumn thisColumn = Columns [ leftJoinColumn ] ;
214
- Dictionary < TKey , ICollection < long > > multimap = thisColumn . GroupColumnValues < TKey > ( ) ;
208
+ Dictionary < TKey , ICollection < long > > multimap = thisColumn . GroupColumnValues < TKey > ( out HashSet < long > thisColumnNullIndices ) ;
215
209
216
210
DataFrameColumn otherColumn = other . Columns [ rightJoinColumn ] ;
217
211
for ( long i = 0 ; i < otherColumn . Length ; i ++ )
218
212
{
219
213
var otherColumnValue = otherColumn [ i ] ;
220
- TKey otherColumnValueOrDefault = ( TKey ) ( otherColumnValue == null ? default ( TKey ) : otherColumnValue ) ;
221
- if ( multimap . TryGetValue ( otherColumnValueOrDefault , out ICollection < long > rowNumbers ) )
214
+ if ( otherColumnValue != null )
222
215
{
223
- foreach ( long row in rowNumbers )
216
+ if ( multimap . TryGetValue ( ( TKey ) otherColumnValue , out ICollection < long > rowNumbers ) )
224
217
{
225
- if ( otherColumnValue == null )
218
+ foreach ( long row in rowNumbers )
226
219
{
227
- if ( thisColumn [ row ] == null )
228
- {
229
- leftRowIndices . Append ( row ) ;
230
- rightRowIndices . Append ( i ) ;
231
- }
232
- }
233
- else
234
- {
235
- if ( thisColumn [ row ] != null )
236
- {
237
- leftRowIndices . Append ( row ) ;
238
- rightRowIndices . Append ( i ) ;
239
- }
220
+ leftRowIndices . Append ( row ) ;
221
+ rightRowIndices . Append ( i ) ;
240
222
}
241
223
}
224
+ else
225
+ {
226
+ leftRowIndices . Append ( null ) ;
227
+ rightRowIndices . Append ( i ) ;
228
+ }
242
229
}
243
230
else
244
231
{
245
- leftRowIndices . Append ( null ) ;
246
- rightRowIndices . Append ( i ) ;
232
+ foreach ( long thisColumnNullIndex in thisColumnNullIndices )
233
+ {
234
+ leftRowIndices . Append ( thisColumnNullIndex ) ;
235
+ rightRowIndices . Append ( i ) ;
236
+ }
247
237
}
248
238
}
249
239
}
@@ -253,97 +243,106 @@ public DataFrame Merge<TKey>(DataFrame other, string leftJoinColumn, string righ
253
243
long leftRowCount = Rows . Count ;
254
244
long rightRowCount = other . Rows . Count ;
255
245
256
- var leftColumnIsSmaller = ( leftRowCount <= rightRowCount ) ;
246
+ bool leftColumnIsSmaller = leftRowCount <= rightRowCount ;
257
247
DataFrameColumn hashColumn = leftColumnIsSmaller ? Columns [ leftJoinColumn ] : other . Columns [ rightJoinColumn ] ;
258
248
DataFrameColumn otherColumn = ReferenceEquals ( hashColumn , Columns [ leftJoinColumn ] ) ? other . Columns [ rightJoinColumn ] : Columns [ leftJoinColumn ] ;
259
- Dictionary < TKey , ICollection < long > > multimap = hashColumn . GroupColumnValues < TKey > ( ) ;
249
+ Dictionary < TKey , ICollection < long > > multimap = hashColumn . GroupColumnValues < TKey > ( out HashSet < long > smallerDataFrameColumnNullIndices ) ;
260
250
261
251
for ( long i = 0 ; i < otherColumn . Length ; i ++ )
262
252
{
263
253
var otherColumnValue = otherColumn [ i ] ;
264
- TKey otherColumnValueOrDefault = ( TKey ) ( otherColumnValue == null ? default ( TKey ) : otherColumnValue ) ;
265
- if ( multimap . TryGetValue ( otherColumnValueOrDefault , out ICollection < long > rowNumbers ) )
254
+ if ( otherColumnValue != null )
266
255
{
267
- foreach ( long row in rowNumbers )
256
+ if ( multimap . TryGetValue ( ( TKey ) otherColumnValue , out ICollection < long > rowNumbers ) )
268
257
{
269
- if ( otherColumnValue == null )
270
- {
271
- if ( hashColumn [ row ] == null )
272
- {
273
- leftRowIndices . Append ( leftColumnIsSmaller ? row : i ) ;
274
- rightRowIndices . Append ( leftColumnIsSmaller ? i : row ) ;
275
- }
276
- }
277
- else
258
+ foreach ( long row in rowNumbers )
278
259
{
279
- if ( hashColumn [ row ] != null )
280
- {
281
- leftRowIndices . Append ( leftColumnIsSmaller ? row : i ) ;
282
- rightRowIndices . Append ( leftColumnIsSmaller ? i : row ) ;
283
- }
260
+ leftRowIndices . Append ( leftColumnIsSmaller ? row : i ) ;
261
+ rightRowIndices . Append ( leftColumnIsSmaller ? i : row ) ;
284
262
}
285
263
}
286
264
}
265
+ else
266
+ {
267
+ foreach ( long nullIndex in smallerDataFrameColumnNullIndices )
268
+ {
269
+ leftRowIndices . Append ( leftColumnIsSmaller ? nullIndex : i ) ;
270
+ rightRowIndices . Append ( leftColumnIsSmaller ? i : nullIndex ) ;
271
+ }
272
+ }
287
273
}
288
274
}
289
275
else if ( joinAlgorithm == JoinAlgorithm . FullOuter )
290
276
{
291
277
DataFrameColumn otherColumn = other . Columns [ rightJoinColumn ] ;
292
- Dictionary < TKey , ICollection < long > > multimap = otherColumn . GroupColumnValues < TKey > ( ) ;
278
+ Dictionary < TKey , ICollection < long > > multimap = otherColumn . GroupColumnValues < TKey > ( out HashSet < long > otherColumnNullIndices ) ;
293
279
Dictionary < TKey , long > intersection = new Dictionary < TKey , long > ( EqualityComparer < TKey > . Default ) ;
294
280
295
281
// Go over the records in this dataframe and match with the dictionary
296
282
DataFrameColumn thisColumn = Columns [ leftJoinColumn ] ;
283
+ Int64DataFrameColumn thisColumnNullIndices = new Int64DataFrameColumn ( "ThisColumnNullIndices" ) ;
297
284
298
285
for ( long i = 0 ; i < thisColumn . Length ; i ++ )
299
286
{
300
287
var thisColumnValue = thisColumn [ i ] ;
301
- TKey thisColumnValueOrDefault = ( TKey ) ( thisColumnValue == null ? default ( TKey ) : thisColumnValue ) ;
302
- if ( multimap . TryGetValue ( thisColumnValueOrDefault , out ICollection < long > rowNumbers ) )
288
+ if ( thisColumnValue != null )
303
289
{
304
- foreach ( long row in rowNumbers )
290
+ if ( multimap . TryGetValue ( ( TKey ) thisColumnValue , out ICollection < long > rowNumbers ) )
305
291
{
306
- if ( thisColumnValue == null )
307
- {
308
- // Has to match only with nulls in otherColumn
309
- if ( otherColumn [ row ] == null )
310
- {
311
- leftRowIndices . Append ( i ) ;
312
- rightRowIndices . Append ( row ) ;
313
- if ( ! intersection . ContainsKey ( thisColumnValueOrDefault ) )
314
- {
315
- intersection . Add ( thisColumnValueOrDefault , rowNumber ) ;
316
- }
317
- }
318
- }
319
- else
292
+ foreach ( long row in rowNumbers )
320
293
{
321
- // Cannot match to nulls in otherColumn
322
- if ( otherColumn [ row ] != null )
294
+ leftRowIndices . Append ( i ) ;
295
+ rightRowIndices . Append ( row ) ;
296
+ if ( ! intersection . ContainsKey ( ( TKey ) thisColumnValue ) )
323
297
{
324
- leftRowIndices . Append ( i ) ;
325
- rightRowIndices . Append ( row ) ;
326
- if ( ! intersection . ContainsKey ( thisColumnValueOrDefault ) )
327
- {
328
- intersection . Add ( thisColumnValueOrDefault , rowNumber ) ;
329
- }
298
+ intersection . Add ( ( TKey ) thisColumnValue , rowNumber ) ;
330
299
}
331
300
}
332
301
}
302
+ else
303
+ {
304
+ leftRowIndices . Append ( i ) ;
305
+ rightRowIndices . Append ( null ) ;
306
+ }
333
307
}
334
308
else
335
309
{
336
- leftRowIndices . Append ( i ) ;
337
- rightRowIndices . Append ( null ) ;
310
+ thisColumnNullIndices . Append ( i ) ;
338
311
}
339
312
}
340
313
for ( long i = 0 ; i < otherColumn . Length ; i ++ )
341
314
{
342
- TKey value = ( TKey ) ( otherColumn [ i ] ?? default ( TKey ) ) ;
343
- if ( ! intersection . ContainsKey ( value ) )
315
+ var value = otherColumn [ i ] ;
316
+ if ( value != null )
317
+ {
318
+ if ( ! intersection . ContainsKey ( ( TKey ) value ) )
319
+ {
320
+ leftRowIndices . Append ( null ) ;
321
+ rightRowIndices . Append ( i ) ;
322
+ }
323
+ }
324
+ }
325
+
326
+ // Now handle the null rows
327
+ foreach ( long ? thisColumnNullIndex in thisColumnNullIndices )
328
+ {
329
+ foreach ( long otherColumnNullIndex in otherColumnNullIndices )
330
+ {
331
+ leftRowIndices . Append ( thisColumnNullIndex . Value ) ;
332
+ rightRowIndices . Append ( otherColumnNullIndex ) ;
333
+ }
334
+ if ( otherColumnNullIndices . Count == 0 )
335
+ {
336
+ leftRowIndices . Append ( thisColumnNullIndex . Value ) ;
337
+ rightRowIndices . Append ( null ) ;
338
+ }
339
+ }
340
+ if ( thisColumnNullIndices . Length == 0 )
341
+ {
342
+ foreach ( long otherColumnNullIndex in otherColumnNullIndices )
344
343
{
345
344
leftRowIndices . Append ( null ) ;
346
- rightRowIndices . Append ( i ) ;
345
+ rightRowIndices . Append ( otherColumnNullIndex ) ;
347
346
}
348
347
}
349
348
}
0 commit comments