From 4fe09f855df4ed656aaadbac92019621db02157a Mon Sep 17 00:00:00 2001 From: Florian Sihler Date: Fri, 14 Mar 2025 15:03:46 +0100 Subject: [PATCH 01/11] Hooks for AbsInt as Decorators (#1418) * feat: basic concept for data frame create hook * feat: data frame hook for string based column access * feat: map variables to data frame domain at assignment * feat: data frame semantics for assignment and expression list * refactor: structure abstract interpretation hooks * feat: basic tests for data frame abstract domain * feat: test framework for inferred data frame properties * test: add tests for data frame domain * refactor: restructure processor decorators and function mapping * lint-fix: linting errors --------- Co-authored-by: Oliver --- .../data-frame/absint-info.ts | 36 ++++ .../data-frame/domain.ts | 87 +++++++++ .../data-frame/process/data-frame-access.ts | 39 ++++ .../process/data-frame-assignment.ts | 22 +++ .../process/data-frame-expression-list.ts | 39 ++++ .../process/data-frame-function-call.ts | 64 +++++++ .../data-frame/resolve-args.ts | 44 +++++ .../data-frame/semantics.ts | 84 +++++++++ src/dataflow/environments/built-in-config.ts | 2 +- src/dataflow/environments/built-in.ts | 47 ++++- src/dataflow/environments/resolve-by-name.ts | 40 ++++ .../data-frame/data-frame.ts | 173 ++++++++++++++++++ .../data-frame/domain.test.ts | 80 ++++++++ .../data-frame/inference.test.ts | 120 ++++++++++++ 14 files changed, 871 insertions(+), 6 deletions(-) create mode 100644 src/abstract-interpretation/data-frame/absint-info.ts create mode 100644 src/abstract-interpretation/data-frame/domain.ts create mode 100644 src/abstract-interpretation/data-frame/process/data-frame-access.ts create mode 100644 src/abstract-interpretation/data-frame/process/data-frame-assignment.ts create mode 100644 src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts create mode 100644 src/abstract-interpretation/data-frame/process/data-frame-function-call.ts create mode 
100644 src/abstract-interpretation/data-frame/resolve-args.ts create mode 100644 src/abstract-interpretation/data-frame/semantics.ts create mode 100644 test/functionality/abstract-interpretation/data-frame/data-frame.ts create mode 100644 test/functionality/abstract-interpretation/data-frame/domain.test.ts create mode 100644 test/functionality/abstract-interpretation/data-frame/inference.test.ts diff --git a/src/abstract-interpretation/data-frame/absint-info.ts b/src/abstract-interpretation/data-frame/absint-info.ts new file mode 100644 index 0000000000..0df1554cae --- /dev/null +++ b/src/abstract-interpretation/data-frame/absint-info.ts @@ -0,0 +1,36 @@ +import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; +import type { DataFrameDomain } from './domain'; +import type { DataFrameOperationName } from './semantics'; + +export interface DataFrameOperation { + operation: DataFrameOperationName, + operand: NodeId | undefined, + arguments: (NodeId | undefined)[] +} + +interface DataFrameStatementInfo { + type: 'statement', + domain: Map +} + +interface DataFrameAssignmentInfo { + type: 'assignment', + identifier: NodeId, + expression: NodeId +} + +interface DataFrameExpressionInfo { + type: 'expression', + operations: DataFrameOperation[] +} + +interface DataFrameSymbolInfo { + type: 'symbol', + value: DataFrameDomain +} + +type DataFrameInfo = DataFrameStatementInfo | DataFrameAssignmentInfo | DataFrameExpressionInfo | DataFrameSymbolInfo; + +export interface AbstractInterpretationInfo { + dataFrame?: DataFrameInfo +} diff --git a/src/abstract-interpretation/data-frame/domain.ts b/src/abstract-interpretation/data-frame/domain.ts new file mode 100644 index 0000000000..264710f2d4 --- /dev/null +++ b/src/abstract-interpretation/data-frame/domain.ts @@ -0,0 +1,87 @@ +type Interval = [number, number]; + +export const IntervalBottom = 'bottom'; +export const IntervalTop: Interval = [0, Infinity]; +export type IntervalDomain = Interval | typeof 
IntervalBottom; + +export const ColNamesBottom: string[] = []; +export const ColNamesTop = 'top'; +export type ColNamesDomain = string[] | typeof ColNamesTop; + +export interface DataFrameDomain { + colnames: ColNamesDomain, + cols: IntervalDomain, + rows: IntervalDomain +} + +export const DataFrameBottom: DataFrameDomain = { + colnames: ColNamesBottom, + cols: IntervalBottom, + rows: IntervalBottom +}; + +export const DataFrameTop: DataFrameDomain = { + colnames: ColNamesTop, + cols: IntervalTop, + rows: IntervalTop +}; + +export function leqColNames(X1: ColNamesDomain, X2: ColNamesDomain): boolean { + return X2 === ColNamesTop || (X1 !== ColNamesTop && new Set(X1).isSubsetOf(new Set(X2))); +} + +export function joinColNames(X1: ColNamesDomain, X2: ColNamesDomain): ColNamesDomain { + if(X1 === ColNamesTop || X2 === ColNamesTop) { + return ColNamesTop; + } else { + return Array.from(new Set(X1).union(new Set(X2))); + } +} + +export function meetColNames(X1: ColNamesDomain, X2: ColNamesDomain): ColNamesDomain { + if(X1 === ColNamesTop && X2 === ColNamesTop) { + return ColNamesTop; + } else if(X1 === ColNamesTop) { + return X2; + } else if(X2 === ColNamesTop) { + return X1; + } else { + return Array.from(new Set(X1).intersection(new Set(X2))); + } +} + +export function subtractColNames(X1: ColNamesDomain, X2: ColNamesDomain): ColNamesDomain { + if(X2 === ColNamesTop) { + return ColNamesBottom; + } else if(X1 === ColNamesTop) { + return ColNamesTop; + } else { + return Array.from(new Set(X1).difference(new Set(X2))); + } +} + +export function leqInterval(X1: IntervalDomain, X2: IntervalDomain): boolean { + return X1 === IntervalBottom || (X2 !== IntervalBottom && X2[0] <= X1[0] && X1[1] <= X2[1]); +} + +export function joinInterval(X1: IntervalDomain, X2: IntervalDomain): IntervalDomain { + if(X1 === IntervalBottom && X2 === IntervalBottom) { + return IntervalBottom; + } else if(X1 === IntervalBottom) { + return X2; + } else if(X2 === IntervalBottom) { + return X1; + 
} else { + return [Math.min(X1[0], X2[0]), Math.max(X1[1], X2[1])]; + } +} + +export function meetInterval(X1: IntervalDomain, X2: IntervalDomain): IntervalDomain { + if(X1 === IntervalBottom || X2 === IntervalBottom) { + return IntervalBottom; + } else if(Math.max(X1[0], X2[0]) > Math.min(X1[1], X2[1])) { + return IntervalBottom; + } else { + return [Math.max(X1[0], X2[0]), Math.min(X1[1], X2[1])]; + } +} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-access.ts b/src/abstract-interpretation/data-frame/process/data-frame-access.ts new file mode 100644 index 0000000000..732972282f --- /dev/null +++ b/src/abstract-interpretation/data-frame/process/data-frame-access.ts @@ -0,0 +1,39 @@ +import type { ForceArguments } from '../../../dataflow/internal/process/functions/call/common'; +import type { DataflowProcessorInformation } from '../../../dataflow/processor'; +import type { RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; +import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id'; +import type { AbstractInterpretationInfo } from '../absint-info'; + +export function processDataFrameAccess( + name: RSymbol, + args: readonly RFunctionArgument[], + rootId: NodeId, + data: DataflowProcessorInformation, + config: { treatIndicesAsString: boolean } & ForceArguments +) { + if(config.treatIndicesAsString) { + processDataFrameStringBasedAccess(name, args); + } +} + +function processDataFrameStringBasedAccess( + name: RSymbol, + args: readonly RFunctionArgument[] +) { + const leftArg = args[0] !== EmptyArgument ? args[0] : undefined; + const rightArg = args[1] !== EmptyArgument ? 
args[1]: undefined; + + if(args.length === 2 && leftArg !== undefined && rightArg !== undefined) { + name.info.dataFrame = { + type: 'expression', + operations: [{ + operation: 'accessCol', + operand: leftArg.info.id, + arguments: [rightArg.info.id] + }] + }; + } +} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-assignment.ts b/src/abstract-interpretation/data-frame/process/data-frame-assignment.ts new file mode 100644 index 0000000000..ebb2f82a79 --- /dev/null +++ b/src/abstract-interpretation/data-frame/process/data-frame-assignment.ts @@ -0,0 +1,22 @@ +import type { RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; +import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; +import type { AbstractInterpretationInfo } from '../absint-info'; + +export function processDataFrameAssignment( + name: RSymbol, + args: readonly RFunctionArgument[] +) { + const leftArg = args[0] !== EmptyArgument ? args[0] : undefined; + const rightArg = args[1] !== EmptyArgument ? 
args[1] : undefined; + + if(args.length === 2 && leftArg?.value?.type === RType.Symbol && rightArg?.value !== undefined) { + name.info.dataFrame = { + type: 'assignment', + identifier: leftArg.value.info.id, + expression: rightArg.value.info.id + }; + } +} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts b/src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts new file mode 100644 index 0000000000..2172c51afb --- /dev/null +++ b/src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts @@ -0,0 +1,39 @@ +import type { DataflowProcessorInformation } from '../../../dataflow/processor'; +import type { RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; +import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id'; +import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; +import type { DataFrameDomain } from '../domain'; +import type { AbstractInterpretationInfo } from '../absint-info'; +import { applyExpressionSemantics } from '../semantics'; + +export function processDataFrameExpressionList( + name: RSymbol, + args: readonly RFunctionArgument[], + rootId: NodeId, + data: DataflowProcessorInformation +) { + const domain: Map = new Map(); + + for(const arg of args) { + if(arg !== EmptyArgument && arg.value?.info.dataFrame?.type === 'assignment') { + const resolveInfo = { environment: data.environment, idMap: data.completeAst.idMap, full: true }; + const identifier = resolveInfo.idMap.get(arg.value.info.dataFrame.identifier); + const expression = resolveInfo.idMap.get(arg.value.info.dataFrame.expression); + + if(identifier?.type === RType.Symbol && 
expression !== undefined) { + const dataFrameDomain = applyExpressionSemantics(expression, domain, resolveInfo); + + if(dataFrameDomain !== undefined) { + domain.set(identifier.info.id, dataFrameDomain); + identifier.info.dataFrame = { + type: 'symbol', + value: dataFrameDomain + }; + } + } + } + } +} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-function-call.ts b/src/abstract-interpretation/data-frame/process/data-frame-function-call.ts new file mode 100644 index 0000000000..6cc8447d88 --- /dev/null +++ b/src/abstract-interpretation/data-frame/process/data-frame-function-call.ts @@ -0,0 +1,64 @@ +import type { BuiltInIdentifierProcessorDecorator } from '../../../dataflow/environments/built-in'; +import type { RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; +import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { AbstractInterpretationInfo } from '../absint-info'; + +const DataFrameFunctionMapper = { + 'data.frame': processDataFrameCreate, + 'as.data.frame': processDataFrameUnknownCreate, + 'read.csv': processDataFrameUnknownCreate, + 'read.table': processDataFrameUnknownCreate +} as const satisfies Record>; + +const DataFrameSpecialArgumentsMapper = { + 'data.frame': ['row.names', 'check.rows', 'check.names', 'fix.empty.names', 'stringsAsFactors'] as string[] +} as const satisfies Partial>; + +type DataFrameFunction = keyof typeof DataFrameFunctionMapper; + +export function processDataFrameFunctionCall( + name: RSymbol, + args: readonly RFunctionArgument[] +) { + if(name.content in DataFrameFunctionMapper) { + DataFrameFunctionMapper[name.content as DataFrameFunction]?.(name, args); + } +} + +function processDataFrameCreate( + name: RSymbol, + args: readonly 
RFunctionArgument[] +) { + name.info.dataFrame = { + type: 'expression', + operations: [{ + operation: 'create', + operand: undefined, + arguments: args + .filter(arg => !isSpecialArgument('data.frame', arg)) + .map(arg => arg !== EmptyArgument ? arg.info.id : undefined) + }] + }; +} + +function processDataFrameUnknownCreate( + name: RSymbol +) { + name.info.dataFrame = { + type: 'expression', + operations: [{ + operation: 'unknown', + operand: undefined, + arguments: [] + }] + }; +} + +function isSpecialArgument(funct: keyof typeof DataFrameSpecialArgumentsMapper, argument: RFunctionArgument) { + if(argument === EmptyArgument || argument.name === undefined) { + return false; + } + return DataFrameSpecialArgumentsMapper[funct].includes(argument.name.content); +} diff --git a/src/abstract-interpretation/data-frame/resolve-args.ts b/src/abstract-interpretation/data-frame/resolve-args.ts new file mode 100644 index 0000000000..4d99ba0a2d --- /dev/null +++ b/src/abstract-interpretation/data-frame/resolve-args.ts @@ -0,0 +1,44 @@ +import type { ResolveInfo } from '../../dataflow/environments/resolve-by-name'; +import { resolveIdToValue } from '../../dataflow/environments/resolve-by-name'; +import type { RNodeWithParent } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; +import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; + +export function resolveIdToArgName(id: NodeId | RNodeWithParent, { graph, idMap } : ResolveInfo): string | undefined { + idMap ??= graph?.idMap; + const node = typeof id === 'object' ? id : idMap?.get(id); + + if(node?.type === RType.Argument) { + return node.name?.content; + } + return undefined; +} + +export function resolveIdToArgValueSymbolName(id: NodeId | RNodeWithParent, { graph, idMap } : ResolveInfo): string | undefined { + idMap ??= graph?.idMap; + const node = typeof id === 'object' ? 
id : idMap?.get(id); + + if(node?.type === RType.Argument && node.value !== undefined) { + if(node.value.type === RType.Symbol) { + return node.value.content; + } else if(node.value.type === RType.String) { + return node.value.content.str; + } + } + return undefined; +} + +export function resolveIdToArgVectorLength(id: NodeId | RNodeWithParent, { graph, idMap, ...resolveInfo } : ResolveInfo): number | undefined { + idMap ??= graph?.idMap; + const node = typeof id === 'object' ? id : idMap?.get(id); + + if(node?.type !== RType.Argument || node.value === undefined) { + return undefined; + } + const resolvedValue = resolveIdToValue(node.value, { graph, idMap, ...resolveInfo }); + + if(resolvedValue?.length === 1) { + return Array.isArray(resolvedValue[0]) ? resolvedValue[0].length : undefined; + } + return undefined; +} diff --git a/src/abstract-interpretation/data-frame/semantics.ts b/src/abstract-interpretation/data-frame/semantics.ts new file mode 100644 index 0000000000..6af3654624 --- /dev/null +++ b/src/abstract-interpretation/data-frame/semantics.ts @@ -0,0 +1,84 @@ +import type { ResolveInfo } from '../../dataflow/environments/resolve-by-name'; +import { resolveByName } from '../../dataflow/environments/resolve-by-name'; +import type { RNode } from '../../r-bridge/lang-4.x/ast/model/model'; +import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; +import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; +import type { DataFrameDomain, ColNamesDomain } from './domain'; +import { DataFrameTop, ColNamesTop, IntervalTop, ColNamesBottom, joinColNames } from './domain'; +import type { AbstractInterpretationInfo, DataFrameOperation } from './absint-info'; +import { resolveIdToArgName, resolveIdToArgVectorLength, resolveIdToArgValueSymbolName } from './resolve-args'; + +const DataFrameSemanticsMapper = { + 'create': applyCreateSemantics, 
+ 'accessCol': applyAccessColSemantics, + 'unknown': applyUnknownSemantics +} as const satisfies Record; + +type DataFrameSemanticsApplier = (value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo) => DataFrameDomain; +export type DataFrameOperationName = keyof typeof DataFrameSemanticsMapper; + +export function applyExpressionSemantics( + node: RNode, + domain: Map, + resolveInfo : ResolveInfo +): DataFrameDomain | undefined { + if(node.type === RType.FunctionCall && node.named && node.functionName.info.dataFrame?.type === 'expression') { + let dataFrameDomain: DataFrameDomain = DataFrameTop; + + for(const operation of node.functionName.info.dataFrame.operations) { + if(operation.operand === undefined) { + const semanticsApplier = DataFrameSemanticsMapper[operation.operation]; + dataFrameDomain = semanticsApplier(dataFrameDomain, operation, resolveInfo); + } else { + const operand = resolveInfo.idMap?.get(operation.operand); + const operandDomain = operand ? applyExpressionSemantics(operand, domain, resolveInfo) ?? DataFrameTop : DataFrameTop; + const semanticsApplier = DataFrameSemanticsMapper[operation.operation]; + dataFrameDomain = semanticsApplier(operandDomain, operation, resolveInfo); + } + } + return dataFrameDomain; + } else if(node.type === RType.Symbol && resolveInfo.environment !== undefined) { + const identifiers = resolveByName(node.content, resolveInfo.environment); + + if(identifiers?.length === 1) { + const dataFrameDomain = domain.get(identifiers[0].nodeId); + + if(dataFrameDomain !== undefined) { + node.info.dataFrame = { + type: 'symbol', + value: dataFrameDomain + }; + } + return dataFrameDomain; + } + } + return undefined; +} + +function applyCreateSemantics(value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo): DataFrameDomain { + const argNames = event.arguments.map(arg => arg ? resolveIdToArgName(arg, info) : undefined); + const argLengths = event.arguments.map(arg => arg ? 
resolveIdToArgVectorLength(arg, info) : undefined); + const colnames = argNames.some(arg => arg === undefined) ? ColNamesTop : argNames as ColNamesDomain; + const rowCount = argLengths.some(arg => arg === undefined) ? undefined : Math.max(...argLengths as number[], 0); + + return { + colnames: colnames, + cols: [event.arguments.length, event.arguments.length], + rows: rowCount !== undefined ? [rowCount, rowCount] : IntervalTop + }; +} + +function applyAccessColSemantics(value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo): DataFrameDomain { + const argNames = event.arguments.map(arg => arg ? resolveIdToArgValueSymbolName(arg, info) : undefined); + const colnames = argNames.some(arg => arg === undefined) ? ColNamesBottom : argNames as ColNamesDomain; + + return { + ...value, + colnames: joinColNames(value.colnames, colnames) + }; +} + +function applyUnknownSemantics(): DataFrameDomain { + return DataFrameTop; +} diff --git a/src/dataflow/environments/built-in-config.ts b/src/dataflow/environments/built-in-config.ts index 2c2a3d8aec..2964e9ed5a 100644 --- a/src/dataflow/environments/built-in-config.ts +++ b/src/dataflow/environments/built-in-config.ts @@ -80,7 +80,7 @@ export function registerBuiltInFunctions mappedProcessor(name, args, rootId, data, config as any), config, name, diff --git a/src/dataflow/environments/built-in.ts b/src/dataflow/environments/built-in.ts index 6a77535c63..8de00d2d7f 100644 --- a/src/dataflow/environments/built-in.ts +++ b/src/dataflow/environments/built-in.ts @@ -30,6 +30,10 @@ import { registerBuiltInDefinitions } from './built-in-config'; import { DefaultBuiltinConfig } from './default-builtin-config'; import type { LinkTo } from '../../queries/catalog/call-context-query/call-context-query-format'; import { processList } from '../internal/process/functions/call/built-in/built-in-list'; +import { processDataFrameAccess } from '../../abstract-interpretation/data-frame/process/data-frame-access'; +import { 
processDataFrameAssignment } from '../../abstract-interpretation/data-frame/process/data-frame-assignment'; +import { processDataFrameExpressionList } from '../../abstract-interpretation/data-frame/process/data-frame-expression-list'; +import { processDataFrameFunctionCall } from '../../abstract-interpretation/data-frame/process/data-frame-function-call'; import { processVector } from '../internal/process/functions/call/built-in/built-in-vector'; import { processRm } from '../internal/process/functions/call/built-in/built-in-rm'; @@ -50,6 +54,14 @@ export type BuiltInIdentifierProcessorWithConfig = ( config: Config ) => DataflowInformation +export type BuiltInIdentifierProcessorDecorator = ( + name: RSymbol, + args: readonly RFunctionArgument[], + rootId: NodeId, + data: DataflowProcessorInformation, + config: Config +) => void + export interface BuiltInIdentifierDefinition extends IdentifierReference { type: ReferenceType.BuiltInFunction definedAt: typeof BuiltIn @@ -131,17 +143,32 @@ export function registerBuiltInFunctions( + processor: BuiltInIdentifierProcessorWithConfig, + ...decorators: BuiltInProcessorDecoratorName[] +): BuiltInIdentifierProcessorWithConfig { + return (name, args, rootId, data, config) => { + const result = processor(name, args, rootId, data, config); + decorators + .map(name => BuiltInProcessorDecoratorMapper[name] as BuiltInIdentifierProcessorDecorator) + .forEach(decorator => decorator(name, args, rootId, { ...data, environment: result.environment }, config)); + + return result; + }; +} + export const BuiltInProcessorMapper = { - 'builtin:default': defaultBuiltInProcessor, + 'builtin:default': decorateProcessor(defaultBuiltInProcessor, 'dataframe:function-call'), 'builtin:apply': processApply, - 'builtin:expression-list': processExpressionList, + 'builtin:expression-list': decorateProcessor(processExpressionList, 'dataframe:expression-list'), 'builtin:source': processSourceCall, - 'builtin:access': processAccess, + 'builtin:access': 
decorateProcessor(processAccess, 'dataframe:access'), 'builtin:if-then-else': processIfThenElse, 'builtin:get': processGet, 'builtin:rm': processRm, 'builtin:library': processLibrary, - 'builtin:assignment': processAssignment, + 'builtin:assignment': decorateProcessor(processAssignment, 'dataframe:assignment'), 'builtin:special-bin-op': processSpecialBinOp, 'builtin:pipe': processPipe, 'builtin:function-definition': processFunctionDefinition, @@ -152,11 +179,21 @@ export const BuiltInProcessorMapper = { 'builtin:replacement': processReplacementFunction, 'builtin:list': processList, 'builtin:vector': processVector, -} as const satisfies Record<`builtin:${string}`, BuiltInIdentifierProcessorWithConfig>; +// eslint-disable-next-line @typescript-eslint/no-explicit-any +} as const satisfies Record<`builtin:${string}`, BuiltInIdentifierProcessorWithConfig>; export type BuiltInMappingName = keyof typeof BuiltInProcessorMapper; export type ConfigOfBuiltInMappingName = Parameters[4]; +const BuiltInProcessorDecoratorMapper = { + 'dataframe:function-call': processDataFrameFunctionCall, + 'dataframe:access': processDataFrameAccess, + 'dataframe:assignment': processDataFrameAssignment, + 'dataframe:expression-list': processDataFrameExpressionList +} as const satisfies Record<`${string}:${string}`, BuiltInIdentifierProcessorDecorator>; + +type BuiltInProcessorDecoratorName = keyof typeof BuiltInProcessorDecoratorMapper; + export const BuiltInMemory = new Map(); export const EmptyBuiltInMemory = new Map(); diff --git a/src/dataflow/environments/resolve-by-name.ts b/src/dataflow/environments/resolve-by-name.ts index 548009a6d0..23eac76cc1 100644 --- a/src/dataflow/environments/resolve-by-name.ts +++ b/src/dataflow/environments/resolve-by-name.ts @@ -16,6 +16,8 @@ import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; import { VisitingQueue } from '../../slicing/static/visiting-queue'; import { envFingerprint } from '../../slicing/static/fingerprint'; import { EdgeType } 
from '../graph/edge'; +import { EmptyArgument } from '../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { RNumberValue } from '../../r-bridge/lang-4.x/convert-values'; const FunctionTargetTypes = ReferenceType.Function | ReferenceType.BuiltInFunction | ReferenceType.Unknown | ReferenceType.Argument | ReferenceType.Parameter; @@ -345,7 +347,45 @@ export function resolveIdToValue(id: NodeId | RNodeWithParent, { environment, gr case RType.Number: case RType.Logical: return [node.content]; + case RType.BinaryOp: + if(full && node.operator === ':' && (node.lhs.type === RType.Number || node.lhs.type === RType.Symbol) && (node.rhs.type === RType.Symbol || node.rhs.type === RType.Number)) { + const leftArg = resolveIdToValue(node.lhs.info.id, { environment, graph, idMap, full }); + const rightArg = resolveIdToValue(node.rhs.info.id, { environment, graph, idMap, full }); + const leftValue = leftArg?.length === 1 ? leftArg[0] : undefined; + const rightValue = rightArg?.length === 1 ? rightArg[0] : undefined; + + if(isRNumberValue(leftValue) && isRNumberValue(rightValue)) { + return [createNumberSequence(leftValue, rightValue)]; + } + } + return undefined; + case RType.FunctionCall: + if(full && node.named && node.functionName.content === 'c') { + const elements = node.arguments.map(arg => arg !== EmptyArgument && arg.value ? resolveIdToValue(arg.value.info.id, { environment, graph, idMap, full }) : undefined); + + return [elements.map(element => element?.length === 1 ? 
element[0] : undefined).flat()]; + } + return undefined; default: return undefined; } +} + +function createNumberSequence(start: RNumberValue, end: RNumberValue): RNumberValue[] { + const sequence: number[] = []; + const min = Math.min(start.num, end.num); + const max = Math.max(start.num, end.num); + + for(let i = min; i <= max; i++) { + sequence.push(i); + } + if(start > end) { + sequence.reverse(); + } + + return sequence.map(value => ({ ...start, num: value })); +} + +function isRNumberValue(value: unknown): value is RNumberValue { + return typeof value === 'object' && value !== null && 'num' in value && typeof value.num === 'number'; } \ No newline at end of file diff --git a/test/functionality/abstract-interpretation/data-frame/data-frame.ts b/test/functionality/abstract-interpretation/data-frame/data-frame.ts new file mode 100644 index 0000000000..cb3d9cca23 --- /dev/null +++ b/test/functionality/abstract-interpretation/data-frame/data-frame.ts @@ -0,0 +1,173 @@ +import { assert, test } from 'vitest'; +import type { DataFrameDomain } from '../../../../src/abstract-interpretation/data-frame/domain'; +import { DataFrameTop, leqColNames, leqInterval } from '../../../../src/abstract-interpretation/data-frame/domain'; +import type { AbstractInterpretationInfo } from '../../../../src/abstract-interpretation/data-frame/absint-info'; +import { PipelineExecutor } from '../../../../src/core/pipeline-executor'; +import { DEFAULT_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines'; +import { RType } from '../../../../src/r-bridge/lang-4.x/ast/model/type'; +import { requestFromInput } from '../../../../src/r-bridge/retriever'; +import type { RShell } from '../../../../src/r-bridge/shell'; +import type { SingleSlicingCriterion } from '../../../../src/slicing/criterion/parse'; +import { slicingCriterionToId } from '../../../../src/slicing/criterion/parse'; +import { assertUnreachable } from '../../../../src/util/assert'; +import { getRangeEnd } 
from '../../../../src/util/range'; +import type { RSymbol } from '../../../../src/r-bridge/lang-4.x/ast/model/nodes/r-symbol'; + +export enum DomainMatchingType { + Exact = 'exact', + Overapproximation = 'overapproximation' +} + +export type DataFrameTestOptions = Record; + +export const DataFrameTestExact = { + colnames: DomainMatchingType.Exact, + cols: DomainMatchingType.Exact, + rows: DomainMatchingType.Exact +}; + +export const DataFrameTestOverapproximation = { + colnames: DomainMatchingType.Overapproximation, + cols: DomainMatchingType.Overapproximation, + rows: DomainMatchingType.Overapproximation +}; + +export function assertDataFrameDomain( + shell: RShell, + code: string, + criterion: SingleSlicingCriterion, + expected: DataFrameDomain, + name: string = code +) { + test(name, async()=> { + const [value] = await getInferredDomainForCriterion(shell, code, criterion); + + assert.deepStrictEqual(value.colnames, expected.colnames, 'column names differ'); + assert.deepStrictEqual(value.cols, expected.cols, 'column count differs'); + assert.deepStrictEqual(value.rows, expected.rows, 'row count differs'); + }); +} + +export function testDataFrameDomain( + shell: RShell, + code: string, + criterion: SingleSlicingCriterion, + /** Whether the inferred properties should match exacly the actual properties or can be an over-approximation (defaults to exact for all properties) */ + options?: Partial, + name: string = code +): void { + const effectiveOptions = { ...DataFrameTestExact, ...options }; + test(name, async()=> { + const [value, node] = await getInferredDomainForCriterion(shell, code, criterion); + const lineNumber = getRangeEnd(node.location)?.[0]; + + if(lineNumber === undefined) { + throw new Error(`cannot resolve line of criterion ${criterion}`); + } + const lines = code.split('\n'); + const outputCode = [ + createCodeForOutput('colnames', criterion, node.content), + createCodeForOutput('cols', criterion, node.content), + createCodeForOutput('rows', 
criterion, node.content) + ]; + lines.splice(lineNumber + 1, 0, ...outputCode); + const instrumentedCode = lines.join('\n'); + + shell.clearEnvironment(); + const output = await shell.sendCommandWithOutput(instrumentedCode); + const colnames = getRealDomainFromOutput('colnames', criterion, output); + const cols = getRealDomainFromOutput('cols', criterion, output); + const rows = getRealDomainFromOutput('rows', criterion, output); + + assertDomainMatching('colnames', value.colnames, colnames, leqColNames, effectiveOptions.colnames); + assertDomainMatching('cols', value.cols, cols, leqInterval, effectiveOptions.cols); + assertDomainMatching('rows', value.rows, rows, leqInterval, effectiveOptions.rows); + }); +} + +function assertDomainMatching( + type: K, + actual: T, + expected: T, + leqFunction: (X1: T, X2: T) => boolean, + matchingType: DomainMatchingType +): void { + switch(matchingType) { + case DomainMatchingType.Exact: + return assert.deepStrictEqual(actual, expected, `${type} differs`); + case DomainMatchingType.Overapproximation: + return assert.isTrue(leqFunction(expected, actual), `${type} is no over-approximation`); + default: + assertUnreachable(matchingType); + } +} + +function createCodeForOutput( + type: keyof DataFrameDomain, + criterion: SingleSlicingCriterion, + symbol: string +): string { + switch(type) { + case 'colnames': + return `cat("${getMarker(type, criterion)}", colnames(${symbol}), "\\n")`; + case 'cols': + return `cat("${getMarker(type, criterion)}", ncol(${symbol}), "\\n")`; + case 'rows': + return `cat("${getMarker(type, criterion)}", nrow(${symbol}), "\\n")`; + default: + assertUnreachable(type); + } +} + +async function getInferredDomainForCriterion( + shell: RShell, + code: string, + criterion: SingleSlicingCriterion +): Promise<[DataFrameDomain, RSymbol]> { + const result = await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, { + parser: shell, + request: requestFromInput(code) + }).allRemainingSteps(); + + const idMap = 
result.dataflow.graph.idMap ?? result.normalize.idMap; + const nodeId = slicingCriterionToId(criterion, idMap); + const node = idMap.get(nodeId); + + if(node === undefined || node.type !== RType.Symbol) { + throw new Error(`slicing criterion ${criterion} does not refer to a R symbol`); + } + const info = node.info as AbstractInterpretationInfo; + const value = info.dataFrame?.type === 'symbol' ? info.dataFrame.value : DataFrameTop; + + return [value, node]; +} + +function getRealDomainFromOutput( + type: K, + criterion: SingleSlicingCriterion, + output: string[] +): DataFrameDomain[K] { + const marker = getMarker(type, criterion); + const line = output.find(line => line.startsWith(marker))?.replace(marker, '').trim(); + + if(line === undefined) { + throw new Error(`cannot parse output of instrumented code for ${type}`); + } + switch(type) { + case 'colnames': { + const value = line.length > 0 ? line.split(' ') : []; + return value as DataFrameDomain[K]; + } + case 'cols': + case 'rows': { + const value = Number.parseInt(line); + return [value, value] as DataFrameDomain[K]; + } + default: + assertUnreachable(type); + } +} + +function getMarker(type: keyof DataFrameDomain, criterion: SingleSlicingCriterion): string { + return `${type.toUpperCase()} ${criterion}`; +} diff --git a/test/functionality/abstract-interpretation/data-frame/domain.test.ts b/test/functionality/abstract-interpretation/data-frame/domain.test.ts new file mode 100644 index 0000000000..4e44c06820 --- /dev/null +++ b/test/functionality/abstract-interpretation/data-frame/domain.test.ts @@ -0,0 +1,80 @@ +import { assert, describe, test } from 'vitest'; +import type { ColNamesDomain, IntervalDomain } from '../../../../src/abstract-interpretation/data-frame/domain'; +import { ColNamesBottom, ColNamesTop, IntervalBottom, IntervalTop, joinColNames, joinInterval, leqColNames, leqInterval, meetColNames, meetInterval, subtractColNames } from '../../../../src/abstract-interpretation/data-frame/domain'; + 
+describe('Data Frame Domain', () => { + describe('Column Names Domain', () => { + const toSet = (value: ColNamesDomain) => value === ColNamesTop ? value : new Set(value); + const check = (X1: ColNamesDomain, X2: ColNamesDomain, leq: boolean, join: ColNamesDomain, meet: ColNamesDomain, difference: ColNamesDomain) => { + test(`${JSON.stringify(X1)} ⊑ ${JSON.stringify(X2)}`, () => { + assert.strictEqual(leqColNames(X1, X2), leq); + }); + test(`${JSON.stringify(X1)} ⊔ ${JSON.stringify(X2)}`, () => { + assert.deepStrictEqual(toSet(joinColNames(X1, X2)), toSet(join)); + }); + test(`${JSON.stringify(X1)} ⊓ ${JSON.stringify(X2)}`, () => { + assert.deepStrictEqual(toSet(meetColNames(X1, X2)), toSet(meet)); + }); + test(`${JSON.stringify(X1)} ∖ ${JSON.stringify(X2)}`, () => { + assert.deepStrictEqual(toSet(subtractColNames(X1, X2)), toSet(difference)); + }); + }; + check(ColNamesBottom, ColNamesBottom, true, ColNamesBottom, ColNamesBottom, ColNamesBottom); + check(ColNamesTop, ColNamesTop, true, ColNamesTop, ColNamesTop, ColNamesBottom); + check(ColNamesBottom, ColNamesTop, true, ColNamesTop, ColNamesBottom, ColNamesBottom); + check(ColNamesTop, ColNamesBottom, false, ColNamesTop, ColNamesBottom, ColNamesTop); + check(ColNamesBottom, ['id', 'age'], true, ['id', 'age'], ColNamesBottom, ColNamesBottom); + check(['id', 'age'], ColNamesBottom, false, ['id', 'age'], ColNamesBottom, ['id', 'age']); + check(['id', 'age'], ['id', 'age'], true, ['id', 'age'], ['id', 'age'], ColNamesBottom); + check(['id', 'age'], ['id', 'age', 'score'], true, ['id', 'age', 'score'], ['id', 'age'], ColNamesBottom); + check(['id', 'age', 'score'], ['id', 'age'], false, ['id', 'age', 'score'], ['id', 'age'], ['score']); + check(['id', 'age', 'score'], ['id', 'category'], false, ['id', 'age', 'score', 'category'], ['id'], ['age', 'score']); + check(['id', 'category'], ['id', 'age', 'score'], false, ['id', 'age', 'score', 'category'], ['id'], ['category']); + check(['id', 'age'], ColNamesTop, true, 
ColNamesTop, ['id', 'age'], ColNamesBottom); + check(ColNamesTop, ['id', 'age'], false, ColNamesTop, ['id', 'age'], ColNamesTop); + }); + + describe('Interval Domain', () => { + const check = (X1: IntervalDomain, X2: IntervalDomain, leq: boolean, join: IntervalDomain, meet: IntervalDomain) => { + test(`${JSON.stringify(X1)} ⊑ ${JSON.stringify(X2)}`, () => { + assert.strictEqual(leqInterval(X1, X2), leq); + }); + test(`${JSON.stringify(X1)} ⊔ ${JSON.stringify(X2)}`, () => { + assert.deepStrictEqual(joinInterval(X1, X2), join); + }); + test(`${JSON.stringify(X1)} ⊓ ${JSON.stringify(X2)}`, () => { + assert.deepStrictEqual(meetInterval(X1, X2), meet); + }); + }; + check(IntervalBottom, IntervalBottom, true, IntervalBottom, IntervalBottom); + check(IntervalTop, IntervalTop, true, IntervalTop, IntervalTop); + check(IntervalBottom, IntervalTop, true, IntervalTop, IntervalBottom); + check(IntervalTop, IntervalBottom, false, IntervalTop, IntervalBottom); + check(IntervalBottom, [2, 2], true, [2, 2], IntervalBottom); + check([2, 2], IntervalBottom, false, [2, 2], IntervalBottom); + check(IntervalBottom, [2, 8], true, [2, 8], IntervalBottom); + check([2, 8], IntervalBottom, false, [2, 8], IntervalBottom); + check([2, 8], [0, 4], false, [0, 8], [2, 4]); + check([0, 4], [2, 8], false, [0, 8], [2, 4]); + check([2, 8], [4, 12], false, [2, 12], [4, 8]); + check([4, 12], [2, 8], false, [2, 12], [4, 8]); + check([2, 8], [8, Infinity], false, [2, Infinity], [8, 8]); + check([8, Infinity], [2, 8], false, [2, Infinity], [8, 8]); + check([2, 8], [2, 4], false, [2, 8], [2, 4]); + check([2, 4], [2, 8], true, [2, 8], [2, 4]); + check([2, 8], [2, 2], false, [2, 8], [2, 2]); + check([2, 2], [2, 8], true, [2, 8], [2, 2]); + check([2, 8], [0, 0], false, [0, 8], IntervalBottom); + check([0, 0], [2, 8], false, [0, 8], IntervalBottom); + check([2, 8], [10, 12], false, [2, 12], IntervalBottom); + check([10, 12], [2, 8], false, [2, 12], IntervalBottom); + check([0, 0], [12, Infinity], false, 
IntervalTop, IntervalBottom); + check([12, Infinity], [0, 0], false, IntervalTop, IntervalBottom); + check([4, Infinity], [12, Infinity], false, [4, Infinity], [12, Infinity]); + check([12, Infinity], [4, Infinity], true, [4, Infinity], [12, Infinity]); + check([2, 8], IntervalTop, true, IntervalTop, [2, 8]); + check(IntervalTop, [2, 8], false, IntervalTop, [2, 8]); + check([12, Infinity], IntervalTop, true, IntervalTop, [12, Infinity]); + check(IntervalTop, [12, Infinity], false, IntervalTop, [12, Infinity]); + }); +}); diff --git a/test/functionality/abstract-interpretation/data-frame/inference.test.ts b/test/functionality/abstract-interpretation/data-frame/inference.test.ts new file mode 100644 index 0000000000..527810c6bf --- /dev/null +++ b/test/functionality/abstract-interpretation/data-frame/inference.test.ts @@ -0,0 +1,120 @@ +import { describe } from 'vitest'; +import { withShell } from '../../_helper/shell'; +import { ColNamesTop, DataFrameTop } from '../../../../src/abstract-interpretation/data-frame/domain'; +import { testDataFrameDomain, assertDataFrameDomain, DomainMatchingType, DataFrameTestOverapproximation } from './data-frame'; + +describe.sequential('Data Frame Abstract Interpretation', withShell(shell => { + assertDataFrameDomain( + shell, + 'df <- data.frame(id = 1:5, age = c(25, 32, 35, 40, 45), score = c(90, 85, 88, 92, 95), row.names = NULL)', + '1@df', + { + colnames: ['id', 'age', 'score'], + cols: [3, 3], + rows: [5, 5] + } + ); + + testDataFrameDomain( + shell, + 'df <- data.frame(id = 1:5, age = c(25, 32, 35, 40, 45), score = c(90, 85, 88, 92, 95), row.names = NULL)', + '1@df' + ); + + assertDataFrameDomain( + shell, + 'df <- data.frame(id = c(1, 2, 3, 5, 6, 7), category = c("A", "B", "A", "A", "B", "B"))', + '1@df', + { + colnames: ['id', 'category'], + cols: [2, 2], + rows: [6, 6] + } + ); + + testDataFrameDomain( + shell, + 'df <- data.frame(id = c(1, 2, 3, 5, 6, 7), category = c("A", "B", "A", "A", "B", "B"))', + '1@df' + ); + + 
assertDataFrameDomain( + shell, + 'df <- data.frame(c(1, 2, 3:5, c(6, 7, c(8, 9))), c("a", "b", "c"))', + '1@df', + { + colnames: ColNamesTop, + cols: [2, 2], + rows: [9, 9] + } + ); + + testDataFrameDomain( + shell, + 'df <- data.frame(c(1, 2, 3:5, c(6, 7, c(8, 9))), c("a", "b", "c"))', + '1@df', + { colnames: DomainMatchingType.Overapproximation } + ); + + assertDataFrameDomain( + shell, + 'df <- data.frame()', + '1@df', + { + colnames: [], + cols: [0, 0], + rows: [0, 0] + } + ); + + testDataFrameDomain( + shell, + 'df <- data.frame()', + '1@df' + ); + + assertDataFrameDomain( + shell, + 'df1 <- data.frame(id = 1:5); df2 <- df1', + '1@df2', + { + colnames: ['id'], + cols: [1, 1], + rows: [5, 5] + } + ); + + testDataFrameDomain( + shell, + 'df1 <- data.frame(id = 1:5); df2 <- df1', + '1@df2' + ); + + assertDataFrameDomain( + shell, + 'df <- read.csv("test.csv")', + '1@df', + DataFrameTop + ); + + testDataFrameDomain( + shell, + 'df <- read.csv(text = "id,age\\n1,30\\n2,50\\n3,45")', + '1@df', + DataFrameTestOverapproximation + ); + + assertDataFrameDomain( + shell, + 'df <- eval(parse(text = "data.frame()"))', + '1@df', + DataFrameTop + ); + + testDataFrameDomain( + shell, + 'df <- eval(parse(text = "data.frame()"))', + '1@df', + DataFrameTestOverapproximation + ); +})); From 30ea0dbc61da06c01a069f155e0a46c1990785b8 Mon Sep 17 00:00:00 2001 From: Oliver Date: Mon, 17 Mar 2025 15:38:58 +0100 Subject: [PATCH 02/11] feat: restructure absint info to capture state for each node --- .../data-frame/absint-info.ts | 22 ++-- .../data-frame/domain.ts | 26 +++++ .../data-frame/process/data-frame-access.ts | 24 +++- .../process/data-frame-assignment.ts | 17 +++ .../process/data-frame-expression-list.ts | 22 +--- .../process/data-frame-function-call.ts | 40 ++++--- .../data-frame/semantics.ts | 110 +++++++++++++----- src/dataflow/environments/resolve-by-name.ts | 5 +- .../data-frame/data-frame.ts | 2 +- 9 files changed, 184 insertions(+), 84 deletions(-) diff --git 
a/src/abstract-interpretation/data-frame/absint-info.ts b/src/abstract-interpretation/data-frame/absint-info.ts index 0df1554cae..5337b414b3 100644 --- a/src/abstract-interpretation/data-frame/absint-info.ts +++ b/src/abstract-interpretation/data-frame/absint-info.ts @@ -5,32 +5,30 @@ import type { DataFrameOperationName } from './semantics'; export interface DataFrameOperation { operation: DataFrameOperationName, operand: NodeId | undefined, - arguments: (NodeId | undefined)[] + arguments: (NodeId | undefined)[], + modify?: boolean } -interface DataFrameStatementInfo { - type: 'statement', - domain: Map +interface DataFrameInfo { + type: string; + domain?: Map } -interface DataFrameAssignmentInfo { +export interface DataFrameAssignmentInfo extends DataFrameInfo { type: 'assignment', identifier: NodeId, expression: NodeId } -interface DataFrameExpressionInfo { +export interface DataFrameExpressionInfo extends DataFrameInfo { type: 'expression', operations: DataFrameOperation[] } -interface DataFrameSymbolInfo { - type: 'symbol', - value: DataFrameDomain +export interface DataFrameOtherInfo extends DataFrameInfo { + type: 'other' } -type DataFrameInfo = DataFrameStatementInfo | DataFrameAssignmentInfo | DataFrameExpressionInfo | DataFrameSymbolInfo; - export interface AbstractInterpretationInfo { - dataFrame?: DataFrameInfo + dataFrame?: DataFrameAssignmentInfo | DataFrameExpressionInfo | DataFrameOtherInfo } diff --git a/src/abstract-interpretation/data-frame/domain.ts b/src/abstract-interpretation/data-frame/domain.ts index 264710f2d4..e3a97b6d50 100644 --- a/src/abstract-interpretation/data-frame/domain.ts +++ b/src/abstract-interpretation/data-frame/domain.ts @@ -85,3 +85,29 @@ export function meetInterval(X1: IntervalDomain, X2: IntervalDomain): IntervalDo return [Math.max(X1[0], X2[0]), Math.min(X1[1], X2[1])]; } } + +export function joinDataFrames(...values: DataFrameDomain[]) { + let value = values[0] ?? 
DataFrameTop; + + for(let i = 1; i < values.length; i++) { + value = { + colnames: joinColNames(value.colnames, values[i].colnames), + cols: joinInterval(value.cols, values[i].cols), + rows: joinInterval(value.rows, values[i].rows) + }; + } + return value; +} + +export function meetDataFrames(...values: DataFrameDomain[]) { + let value = values[0] ?? DataFrameTop; + + for(let i = 1; i < values.length; i++) { + value = { + colnames: meetColNames(value.colnames, values[i].colnames), + cols: meetInterval(value.cols, values[i].cols), + rows: meetInterval(value.rows, values[i].rows) + }; + } + return value; +} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-access.ts b/src/abstract-interpretation/data-frame/process/data-frame-access.ts index 732972282f..1d8c548869 100644 --- a/src/abstract-interpretation/data-frame/process/data-frame-access.ts +++ b/src/abstract-interpretation/data-frame/process/data-frame-access.ts @@ -16,6 +16,8 @@ export function processDataFrameAccess( ) { if(config.treatIndicesAsString) { processDataFrameStringBasedAccess(name, args); + } else { + processDataFrameUnknownAccess(name, args); } } @@ -24,16 +26,32 @@ function processDataFrameStringBasedAccess( args: readonly RFunctionArgument[] ) { const leftArg = args[0] !== EmptyArgument ? args[0] : undefined; - const rightArg = args[1] !== EmptyArgument ? args[1]: undefined; + const rightArg = args[1] !== EmptyArgument ? 
args[1] : undefined; - if(args.length === 2 && leftArg !== undefined && rightArg !== undefined) { + if(args.length === 2 && leftArg?.value !== undefined && rightArg !== undefined) { name.info.dataFrame = { type: 'expression', operations: [{ operation: 'accessCol', - operand: leftArg.info.id, + operand: leftArg.value.info.id, arguments: [rightArg.info.id] }] }; + } else { + processDataFrameUnknownAccess(name, args); } } + +function processDataFrameUnknownAccess( + name: RSymbol, + args: readonly RFunctionArgument[] +) { + name.info.dataFrame = { + type: 'expression', + operations: [{ + operation: 'unknown', + operand: args[0] !== EmptyArgument ? args[0]?.value?.info.id : undefined, + arguments: args.slice(1).map(arg => arg !== EmptyArgument ? arg.info.id : undefined) + }] + }; +} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-assignment.ts b/src/abstract-interpretation/data-frame/process/data-frame-assignment.ts index ebb2f82a79..fa1bfcdd73 100644 --- a/src/abstract-interpretation/data-frame/process/data-frame-assignment.ts +++ b/src/abstract-interpretation/data-frame/process/data-frame-assignment.ts @@ -18,5 +18,22 @@ export function processDataFrameAssignment( identifier: leftArg.value.info.id, expression: rightArg.value.info.id }; + } else { + processDataFrameUnknownAssignment(name, args); } } + +function processDataFrameUnknownAssignment( + name: RSymbol, + args: readonly RFunctionArgument[] +) { + name.info.dataFrame = { + type: 'expression', + operations: [{ + operation: 'unknown', + operand: args[0] !== EmptyArgument ? args[0]?.value?.info.id : undefined, + arguments: args.slice(1).map(arg => arg !== EmptyArgument ? 
arg.info.id : undefined), + modify: true + }] + }; +} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts b/src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts index 2172c51afb..cb18997eaf 100644 --- a/src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts +++ b/src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts @@ -4,10 +4,9 @@ import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-func import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id'; -import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; import type { DataFrameDomain } from '../domain'; import type { AbstractInterpretationInfo } from '../absint-info'; -import { applyExpressionSemantics } from '../semantics'; +import { applySemantics } from '../semantics'; export function processDataFrameExpressionList( name: RSymbol, @@ -15,25 +14,12 @@ export function processDataFrameExpressionList( rootId: NodeId, data: DataflowProcessorInformation ) { + const resolveInfo = { environment: data.environment, idMap: data.completeAst.idMap, full: true }; const domain: Map = new Map(); for(const arg of args) { - if(arg !== EmptyArgument && arg.value?.info.dataFrame?.type === 'assignment') { - const resolveInfo = { environment: data.environment, idMap: data.completeAst.idMap, full: true }; - const identifier = resolveInfo.idMap.get(arg.value.info.dataFrame.identifier); - const expression = resolveInfo.idMap.get(arg.value.info.dataFrame.expression); - - if(identifier?.type === RType.Symbol && expression !== undefined) { - const dataFrameDomain = applyExpressionSemantics(expression, domain, resolveInfo); - - if(dataFrameDomain !== undefined) { - domain.set(identifier.info.id, 
dataFrameDomain); - identifier.info.dataFrame = { - type: 'symbol', - value: dataFrameDomain - }; - } - } + if(arg !== EmptyArgument && arg.value !== undefined) { + applySemantics(arg.value, domain, resolveInfo); } } } diff --git a/src/abstract-interpretation/data-frame/process/data-frame-function-call.ts b/src/abstract-interpretation/data-frame/process/data-frame-function-call.ts index 6cc8447d88..3c14b81960 100644 --- a/src/abstract-interpretation/data-frame/process/data-frame-function-call.ts +++ b/src/abstract-interpretation/data-frame/process/data-frame-function-call.ts @@ -6,15 +6,12 @@ import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/pro import type { AbstractInterpretationInfo } from '../absint-info'; const DataFrameFunctionMapper = { - 'data.frame': processDataFrameCreate, - 'as.data.frame': processDataFrameUnknownCreate, - 'read.csv': processDataFrameUnknownCreate, - 'read.table': processDataFrameUnknownCreate + 'data.frame': processDataFrameCreate } as const satisfies Record>; -const DataFrameSpecialArgumentsMapper = { - 'data.frame': ['row.names', 'check.rows', 'check.names', 'fix.empty.names', 'stringsAsFactors'] as string[] -} as const satisfies Partial>; +const DataFrameSpecialArgumentsMapper: Partial> = { + 'data.frame': ['row.names', 'check.rows', 'check.names', 'fix.empty.names', 'stringsAsFactors'] +}; type DataFrameFunction = keyof typeof DataFrameFunctionMapper; @@ -23,7 +20,11 @@ export function processDataFrameFunctionCall( args: readonly RFunctionArgument[] ) { if(name.content in DataFrameFunctionMapper) { - DataFrameFunctionMapper[name.content as DataFrameFunction]?.(name, args); + const functionName = name.content as DataFrameFunction; + const functionProcessor = DataFrameFunctionMapper[functionName]; + functionProcessor(name, getEffectiveArgs(functionName, args)); + } else { + processDataFrameUnknownCall(name, args); } } @@ -36,29 +37,30 @@ function processDataFrameCreate( operations: [{ operation: 'create', 
operand: undefined, - arguments: args - .filter(arg => !isSpecialArgument('data.frame', arg)) - .map(arg => arg !== EmptyArgument ? arg.info.id : undefined) + arguments: args.map(arg => arg !== EmptyArgument ? arg.info.id : undefined) }] }; } -function processDataFrameUnknownCreate( - name: RSymbol +function processDataFrameUnknownCall( + name: RSymbol, + args: readonly RFunctionArgument[] ) { name.info.dataFrame = { type: 'expression', operations: [{ operation: 'unknown', operand: undefined, - arguments: [] + arguments: args.map(arg => arg !== EmptyArgument ? arg.info.id : undefined) }] }; } -function isSpecialArgument(funct: keyof typeof DataFrameSpecialArgumentsMapper, argument: RFunctionArgument) { - if(argument === EmptyArgument || argument.name === undefined) { - return false; - } - return DataFrameSpecialArgumentsMapper[funct].includes(argument.name.content); +function getEffectiveArgs( + funct: DataFrameFunction, + args: readonly RFunctionArgument[] +): readonly RFunctionArgument[] { + const specialArgs = DataFrameSpecialArgumentsMapper[funct] ?? 
[]; + + return args.filter(arg => arg === EmptyArgument || arg.name === undefined || !specialArgs.includes(arg.name.content)); } diff --git a/src/abstract-interpretation/data-frame/semantics.ts b/src/abstract-interpretation/data-frame/semantics.ts index 6af3654624..159b245fe8 100644 --- a/src/abstract-interpretation/data-frame/semantics.ts +++ b/src/abstract-interpretation/data-frame/semantics.ts @@ -5,8 +5,8 @@ import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/proces import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; import type { DataFrameDomain, ColNamesDomain } from './domain'; -import { DataFrameTop, ColNamesTop, IntervalTop, ColNamesBottom, joinColNames } from './domain'; -import type { AbstractInterpretationInfo, DataFrameOperation } from './absint-info'; +import { DataFrameTop, ColNamesTop, IntervalTop, ColNamesBottom, joinColNames, joinDataFrames } from './domain'; +import type { AbstractInterpretationInfo, DataFrameAssignmentInfo, DataFrameExpressionInfo, DataFrameOperation } from './absint-info'; import { resolveIdToArgName, resolveIdToArgVectorLength, resolveIdToArgValueSymbolName } from './resolve-args'; const DataFrameSemanticsMapper = { @@ -18,42 +18,82 @@ const DataFrameSemanticsMapper = { type DataFrameSemanticsApplier = (value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo) => DataFrameDomain; export type DataFrameOperationName = keyof typeof DataFrameSemanticsMapper; -export function applyExpressionSemantics( +export function applySemantics( node: RNode, domain: Map, - resolveInfo : ResolveInfo -): DataFrameDomain | undefined { - if(node.type === RType.FunctionCall && node.named && node.functionName.info.dataFrame?.type === 'expression') { - let dataFrameDomain: DataFrameDomain = DataFrameTop; - - for(const operation of node.functionName.info.dataFrame.operations) { - if(operation.operand === undefined) { - 
const semanticsApplier = DataFrameSemanticsMapper[operation.operation]; - dataFrameDomain = semanticsApplier(dataFrameDomain, operation, resolveInfo); - } else { - const operand = resolveInfo.idMap?.get(operation.operand); - const operandDomain = operand ? applyExpressionSemantics(operand, domain, resolveInfo) ?? DataFrameTop : DataFrameTop; - const semanticsApplier = DataFrameSemanticsMapper[operation.operation]; - dataFrameDomain = semanticsApplier(operandDomain, operation, resolveInfo); - } - } - return dataFrameDomain; + resolveInfo : ResolveInfo +): DataFrameDomain { + let dataFrameDomain = DataFrameTop; + + if(isAssignment(node)) { + dataFrameDomain = applyAssignmentSemantics(node, domain, resolveInfo); + } else if(isExpression(node)) { + dataFrameDomain = applyExpressionSemantics(node, domain, resolveInfo); + } else if(node.type === RType.FunctionCall && node.named) { + dataFrameDomain = applySemantics(node.functionName, domain, resolveInfo); } else if(node.type === RType.Symbol && resolveInfo.environment !== undefined) { const identifiers = resolveByName(node.content, resolveInfo.environment); + const values = identifiers?.map(id => domain.get(id.nodeId) ?? DataFrameTop); + dataFrameDomain = values ? 
joinDataFrames(...values) : DataFrameTop; + } + node.info.dataFrame ??= { type: 'other' }; + node.info.dataFrame.domain = new Map(domain); + + return dataFrameDomain; +} + +function applyAssignmentSemantics( + node: RNode, + domain: Map, + resolveInfo : ResolveInfo +): DataFrameDomain { + let dataFrameDomain = DataFrameTop; + + const identifier = resolveInfo.idMap?.get(node.info.dataFrame.identifier); + const expression = resolveInfo.idMap?.get(node.info.dataFrame.expression); + + if(identifier?.type === RType.Symbol && expression !== undefined) { + dataFrameDomain = applySemantics(expression, domain, resolveInfo); + domain.set(identifier.info.id, dataFrameDomain); + } + if(identifier !== undefined) { + identifier.info.dataFrame ??= { type: 'other' }; + identifier.info.dataFrame.domain = new Map(domain); + } + return dataFrameDomain; +} + +function applyExpressionSemantics( + node: RNode, + domain: Map, + resolveInfo : ResolveInfo +): DataFrameDomain { + let dataFrameDomain = DataFrameTop; - if(identifiers?.length === 1) { - const dataFrameDomain = domain.get(identifiers[0].nodeId); + for(const operation of node.info.dataFrame.operations) { + if(operation.operand === undefined) { + const semanticsApplier = DataFrameSemanticsMapper[operation.operation]; + dataFrameDomain = semanticsApplier(dataFrameDomain, operation, resolveInfo); + } else { + const operand = resolveInfo.idMap?.get(operation.operand); + const operandDomain = operand ? 
applySemantics(operand, domain, resolveInfo) : DataFrameTop; + const semanticsApplier = DataFrameSemanticsMapper[operation.operation]; + dataFrameDomain = semanticsApplier(operandDomain, operation, resolveInfo); - if(dataFrameDomain !== undefined) { - node.info.dataFrame = { - type: 'symbol', - value: dataFrameDomain - }; + if(operand !== undefined && operation.modify) { + let origins = [operand.info.id]; + + if(operand.type === RType.Symbol && resolveInfo.environment !== undefined) { + const identifiers = resolveByName(operand.content, resolveInfo.environment); + origins = identifiers?.map(id => id.nodeId) ?? origins; + } + for(const origin of origins) { + domain.set(origin, dataFrameDomain); + } } - return dataFrameDomain; } } - return undefined; + return dataFrameDomain; } function applyCreateSemantics(value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo): DataFrameDomain { @@ -82,3 +122,15 @@ function applyAccessColSemantics(value: DataFrameDomain, event: DataFrameOperati function applyUnknownSemantics(): DataFrameDomain { return DataFrameTop; } + +function isAssignment( + node: RNode +): node is RNode { + return node.info.dataFrame?.type === 'assignment'; +} + +function isExpression( + node: RNode +): node is RNode { + return node.info.dataFrame?.type === 'expression'; +} diff --git a/src/dataflow/environments/resolve-by-name.ts b/src/dataflow/environments/resolve-by-name.ts index 23eac76cc1..f3bffa645c 100644 --- a/src/dataflow/environments/resolve-by-name.ts +++ b/src/dataflow/environments/resolve-by-name.ts @@ -18,6 +18,7 @@ import { envFingerprint } from '../../slicing/static/fingerprint'; import { EdgeType } from '../graph/edge'; import { EmptyArgument } from '../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; import type { RNumberValue } from '../../r-bridge/lang-4.x/convert-values'; +import type { NoInfo } from '../../r-bridge/lang-4.x/ast/model/model'; const FunctionTargetTypes = ReferenceType.Function | 
ReferenceType.BuiltInFunction | ReferenceType.Unknown | ReferenceType.Argument | ReferenceType.Parameter; @@ -308,11 +309,11 @@ export function resolveValueOfVariable(identifier: Identifier | undefined, envir } } -export interface ResolveInfo { +export interface ResolveInfo { /** The current environment used for name resolution */ environment?: REnvironmentInformation; /** The id map to resolve the node if given as an id */ - idMap?: AstIdMap; + idMap?: AstIdMap; /** The graph to resolve in */ graph?: DataflowGraph; /** Whether to track variables */ diff --git a/test/functionality/abstract-interpretation/data-frame/data-frame.ts b/test/functionality/abstract-interpretation/data-frame/data-frame.ts index cb3d9cca23..fc868cb1e5 100644 --- a/test/functionality/abstract-interpretation/data-frame/data-frame.ts +++ b/test/functionality/abstract-interpretation/data-frame/data-frame.ts @@ -137,7 +137,7 @@ async function getInferredDomainForCriterion( throw new Error(`slicing criterion ${criterion} does not refer to a R symbol`); } const info = node.info as AbstractInterpretationInfo; - const value = info.dataFrame?.type === 'symbol' ? info.dataFrame.value : DataFrameTop; + const value = info.dataFrame?.domain?.get(node.info.id) ?? 
DataFrameTop; return [value, node]; } From 9016a00a8ce442817945c86731aca85d4d62b143 Mon Sep 17 00:00:00 2001 From: Oliver Date: Mon, 17 Mar 2025 16:30:31 +0100 Subject: [PATCH 03/11] feat: add data frame processor for pipe operator --- .../data-frame/process/data-frame-pipe.ts | 35 +++++++++++++++++++ src/dataflow/environments/built-in.ts | 4 ++- 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 src/abstract-interpretation/data-frame/process/data-frame-pipe.ts diff --git a/src/abstract-interpretation/data-frame/process/data-frame-pipe.ts b/src/abstract-interpretation/data-frame/process/data-frame-pipe.ts new file mode 100644 index 0000000000..813bcd1284 --- /dev/null +++ b/src/abstract-interpretation/data-frame/process/data-frame-pipe.ts @@ -0,0 +1,35 @@ +import { EmptyArgument, type RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; +import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; +import type { AbstractInterpretationInfo } from '../absint-info'; +import { processDataFrameFunctionCall } from './data-frame-function-call'; + +export function processDataFramePipe( + name: RSymbol, + args: readonly RFunctionArgument[] +): void { + const leftArg = args[0] !== EmptyArgument ? args[0] : undefined; + const rightArg = args[1] !== EmptyArgument ? 
args[1] : undefined; + + if(leftArg !== undefined && rightArg?.value?.type === RType.FunctionCall && rightArg.value.named) { + processDataFrameFunctionCall(rightArg.value.functionName, [leftArg, ...rightArg.value.arguments]); + name.info.dataFrame = rightArg.value.functionName.info.dataFrame; + } else { + processDataFrameUnknownPipe(name, args); + } +} + +function processDataFrameUnknownPipe( + name: RSymbol, + args: readonly RFunctionArgument[] +) { + name.info.dataFrame = { + type: 'expression', + operations: [{ + operation: 'unknown', + operand: args[0] !== EmptyArgument ? args[0]?.value?.info.id : undefined, + arguments: args.slice(1).map(arg => arg !== EmptyArgument ? arg.info.id : undefined) + }] + }; +} diff --git a/src/dataflow/environments/built-in.ts b/src/dataflow/environments/built-in.ts index 8de00d2d7f..36f5ad51eb 100644 --- a/src/dataflow/environments/built-in.ts +++ b/src/dataflow/environments/built-in.ts @@ -34,6 +34,7 @@ import { processDataFrameAccess } from '../../abstract-interpretation/data-frame import { processDataFrameAssignment } from '../../abstract-interpretation/data-frame/process/data-frame-assignment'; import { processDataFrameExpressionList } from '../../abstract-interpretation/data-frame/process/data-frame-expression-list'; import { processDataFrameFunctionCall } from '../../abstract-interpretation/data-frame/process/data-frame-function-call'; +import { processDataFramePipe } from '../../abstract-interpretation/data-frame/process/data-frame-pipe'; import { processVector } from '../internal/process/functions/call/built-in/built-in-vector'; import { processRm } from '../internal/process/functions/call/built-in/built-in-rm'; @@ -170,7 +171,7 @@ export const BuiltInProcessorMapper = { 'builtin:library': processLibrary, 'builtin:assignment': decorateProcessor(processAssignment, 'dataframe:assignment'), 'builtin:special-bin-op': processSpecialBinOp, - 'builtin:pipe': processPipe, + 'builtin:pipe': decorateProcessor(processPipe, 
'dataframe:pipe'), 'builtin:function-definition': processFunctionDefinition, 'builtin:quote': processQuote, 'builtin:for-loop': processForLoop, @@ -189,6 +190,7 @@ const BuiltInProcessorDecoratorMapper = { 'dataframe:function-call': processDataFrameFunctionCall, 'dataframe:access': processDataFrameAccess, 'dataframe:assignment': processDataFrameAssignment, + 'dataframe:pipe': processDataFramePipe, 'dataframe:expression-list': processDataFrameExpressionList } as const satisfies Record<`${string}:${string}`, BuiltInIdentifierProcessorDecorator>; From 16483c9ccf4ede5343b56b065c73ef0cad709b47 Mon Sep 17 00:00:00 2001 From: Oliver Date: Mon, 17 Mar 2025 16:52:29 +0100 Subject: [PATCH 04/11] feat: interface to resolve data frame values --- .../data-frame/abstract-interpretation.ts | 23 +++++++++++++++++++ .../data-frame/data-frame.ts | 7 +++--- 2 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 src/abstract-interpretation/data-frame/abstract-interpretation.ts diff --git a/src/abstract-interpretation/data-frame/abstract-interpretation.ts b/src/abstract-interpretation/data-frame/abstract-interpretation.ts new file mode 100644 index 0000000000..ac772ff91a --- /dev/null +++ b/src/abstract-interpretation/data-frame/abstract-interpretation.ts @@ -0,0 +1,23 @@ +import type { REnvironmentInformation } from '../../dataflow/environments/environment'; +import { resolveByName } from '../../dataflow/environments/resolve-by-name'; +import type { RSymbol } from '../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; +import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { AbstractInterpretationInfo } from './absint-info'; +import type { DataFrameDomain } from './domain'; +import { DataFrameTop, joinDataFrames } from './domain'; + +export function resolveDataFrameValue( + node: RSymbol, + environment: REnvironmentInformation, + identifier: string = node.content +): DataFrameDomain { + const domain = 
node.info.dataFrame?.domain; + + if(domain === undefined) { + return DataFrameTop; + } + const identifiers = resolveByName(identifier, environment); + const values = identifiers?.map(id => domain.get(id.nodeId) ?? DataFrameTop); + + return values ? joinDataFrames(...values) : DataFrameTop; +} diff --git a/test/functionality/abstract-interpretation/data-frame/data-frame.ts b/test/functionality/abstract-interpretation/data-frame/data-frame.ts index fc868cb1e5..587b5a4a25 100644 --- a/test/functionality/abstract-interpretation/data-frame/data-frame.ts +++ b/test/functionality/abstract-interpretation/data-frame/data-frame.ts @@ -1,7 +1,6 @@ import { assert, test } from 'vitest'; import type { DataFrameDomain } from '../../../../src/abstract-interpretation/data-frame/domain'; -import { DataFrameTop, leqColNames, leqInterval } from '../../../../src/abstract-interpretation/data-frame/domain'; -import type { AbstractInterpretationInfo } from '../../../../src/abstract-interpretation/data-frame/absint-info'; +import { leqColNames, leqInterval } from '../../../../src/abstract-interpretation/data-frame/domain'; import { PipelineExecutor } from '../../../../src/core/pipeline-executor'; import { DEFAULT_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines'; import { RType } from '../../../../src/r-bridge/lang-4.x/ast/model/type'; @@ -12,6 +11,7 @@ import { slicingCriterionToId } from '../../../../src/slicing/criterion/parse'; import { assertUnreachable } from '../../../../src/util/assert'; import { getRangeEnd } from '../../../../src/util/range'; import type { RSymbol } from '../../../../src/r-bridge/lang-4.x/ast/model/nodes/r-symbol'; +import { resolveDataFrameValue } from '../../../../src/abstract-interpretation/data-frame/abstract-interpretation'; export enum DomainMatchingType { Exact = 'exact', @@ -136,8 +136,7 @@ async function getInferredDomainForCriterion( if(node === undefined || node.type !== RType.Symbol) { throw new Error(`slicing criterion 
${criterion} does not refer to a R symbol`); } - const info = node.info as AbstractInterpretationInfo; - const value = info.dataFrame?.domain?.get(node.info.id) ?? DataFrameTop; + const value = resolveDataFrameValue(node, result.dataflow.environment); return [value, node]; } From 4b35f041f98a8f719784bc16246c1132b7b95a33 Mon Sep 17 00:00:00 2001 From: Oliver Date: Tue, 18 Mar 2025 08:53:06 +0100 Subject: [PATCH 05/11] refactor: relocate mapping for expression semantics --- .../data-frame/absint-info.ts | 2 +- .../data-frame/expression-semantics.ts | 40 +++++++++++++++++ .../data-frame/semantics.ts | 44 ++----------------- 3 files changed, 45 insertions(+), 41 deletions(-) create mode 100644 src/abstract-interpretation/data-frame/expression-semantics.ts diff --git a/src/abstract-interpretation/data-frame/absint-info.ts b/src/abstract-interpretation/data-frame/absint-info.ts index 5337b414b3..06d62db241 100644 --- a/src/abstract-interpretation/data-frame/absint-info.ts +++ b/src/abstract-interpretation/data-frame/absint-info.ts @@ -1,6 +1,6 @@ import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; import type { DataFrameDomain } from './domain'; -import type { DataFrameOperationName } from './semantics'; +import type { DataFrameOperationName } from './expression-semantics'; export interface DataFrameOperation { operation: DataFrameOperationName, diff --git a/src/abstract-interpretation/data-frame/expression-semantics.ts b/src/abstract-interpretation/data-frame/expression-semantics.ts new file mode 100644 index 0000000000..c535cc67e3 --- /dev/null +++ b/src/abstract-interpretation/data-frame/expression-semantics.ts @@ -0,0 +1,40 @@ +import type { ResolveInfo } from '../../dataflow/environments/resolve-by-name'; +import type { DataFrameOperation } from './absint-info'; +import { ColNamesBottom, ColNamesTop, DataFrameTop, IntervalTop, joinColNames, type ColNamesDomain, type DataFrameDomain } from './domain'; +import { resolveIdToArgName, 
resolveIdToArgVectorLength, resolveIdToArgValueSymbolName } from './resolve-args'; + +export const DataFrameSemanticsMapper = { + 'create': applyCreateSemantics, + 'accessCol': applyAccessColSemantics, + 'unknown': applyUnknownSemantics +} as const satisfies Record; + +export type DataFrameOperationName = keyof typeof DataFrameSemanticsMapper; +type DataFrameSemanticsApplier = (value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo) => DataFrameDomain; + +function applyCreateSemantics(value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo): DataFrameDomain { + const argNames = event.arguments.map(arg => arg ? resolveIdToArgName(arg, info) : undefined); + const argLengths = event.arguments.map(arg => arg ? resolveIdToArgVectorLength(arg, info) : undefined); + const colnames = argNames.some(arg => arg === undefined) ? ColNamesTop : argNames as ColNamesDomain; + const rowCount = argLengths.some(arg => arg === undefined) ? undefined : Math.max(...argLengths as number[], 0); + + return { + colnames: colnames, + cols: [event.arguments.length, event.arguments.length], + rows: rowCount !== undefined ? [rowCount, rowCount] : IntervalTop + }; +} + +function applyAccessColSemantics(value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo): DataFrameDomain { + const argNames = event.arguments.map(arg => arg ? resolveIdToArgValueSymbolName(arg, info) : undefined); + const colnames = argNames.some(arg => arg === undefined) ? 
ColNamesBottom : argNames as ColNamesDomain; + + return { + ...value, + colnames: joinColNames(value.colnames, colnames) + }; +} + +function applyUnknownSemantics(): DataFrameDomain { + return DataFrameTop; +} diff --git a/src/abstract-interpretation/data-frame/semantics.ts b/src/abstract-interpretation/data-frame/semantics.ts index 159b245fe8..1803745139 100644 --- a/src/abstract-interpretation/data-frame/semantics.ts +++ b/src/abstract-interpretation/data-frame/semantics.ts @@ -4,19 +4,10 @@ import type { RNode } from '../../r-bridge/lang-4.x/ast/model/model'; import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; -import type { DataFrameDomain, ColNamesDomain } from './domain'; -import { DataFrameTop, ColNamesTop, IntervalTop, ColNamesBottom, joinColNames, joinDataFrames } from './domain'; -import type { AbstractInterpretationInfo, DataFrameAssignmentInfo, DataFrameExpressionInfo, DataFrameOperation } from './absint-info'; -import { resolveIdToArgName, resolveIdToArgVectorLength, resolveIdToArgValueSymbolName } from './resolve-args'; - -const DataFrameSemanticsMapper = { - 'create': applyCreateSemantics, - 'accessCol': applyAccessColSemantics, - 'unknown': applyUnknownSemantics -} as const satisfies Record; - -type DataFrameSemanticsApplier = (value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo) => DataFrameDomain; -export type DataFrameOperationName = keyof typeof DataFrameSemanticsMapper; +import type { DataFrameDomain } from './domain'; +import { DataFrameTop, joinDataFrames } from './domain'; +import type { AbstractInterpretationInfo, DataFrameAssignmentInfo, DataFrameExpressionInfo } from './absint-info'; +import { DataFrameSemanticsMapper } from './expression-semantics'; export function applySemantics( node: RNode, @@ -96,33 +87,6 @@ function 
applyExpressionSemantics( return dataFrameDomain; } -function applyCreateSemantics(value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo): DataFrameDomain { - const argNames = event.arguments.map(arg => arg ? resolveIdToArgName(arg, info) : undefined); - const argLengths = event.arguments.map(arg => arg ? resolveIdToArgVectorLength(arg, info) : undefined); - const colnames = argNames.some(arg => arg === undefined) ? ColNamesTop : argNames as ColNamesDomain; - const rowCount = argLengths.some(arg => arg === undefined) ? undefined : Math.max(...argLengths as number[], 0); - - return { - colnames: colnames, - cols: [event.arguments.length, event.arguments.length], - rows: rowCount !== undefined ? [rowCount, rowCount] : IntervalTop - }; -} - -function applyAccessColSemantics(value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo): DataFrameDomain { - const argNames = event.arguments.map(arg => arg ? resolveIdToArgValueSymbolName(arg, info) : undefined); - const colnames = argNames.some(arg => arg === undefined) ? 
ColNamesBottom : argNames as ColNamesDomain; - - return { - ...value, - colnames: joinColNames(value.colnames, colnames) - }; -} - -function applyUnknownSemantics(): DataFrameDomain { - return DataFrameTop; -} - function isAssignment( node: RNode ): node is RNode { From 27feb0dff9d172746c5bbc884074f73c5b00be97 Mon Sep 17 00:00:00 2001 From: Oliver Date: Tue, 18 Mar 2025 09:32:08 +0000 Subject: [PATCH 06/11] Support multiple criteria in data frame tests (#1478) * test: add support for multiple criteria for data frame tests * test: test label support and clearer test entry type --- test/functionality/_helper/label.ts | 2 +- .../data-frame/data-frame.ts | 79 ++++++++++++------- .../data-frame/inference.test.ts | 74 ++++++----------- 3 files changed, 76 insertions(+), 79 deletions(-) diff --git a/test/functionality/_helper/label.ts b/test/functionality/_helper/label.ts index 30d0ba6d76..11c6828c49 100644 --- a/test/functionality/_helper/label.ts +++ b/test/functionality/_helper/label.ts @@ -20,7 +20,7 @@ function uniqueTestId(): string { } -const _TestLabelContexts = ['parse', 'desugar-shell', 'desugar-tree-sitter', 'dataflow', 'other', 'slice', 'output', 'lineage', 'query', 'search'] as const; +const _TestLabelContexts = ['parse', 'desugar-shell', 'desugar-tree-sitter', 'dataflow', 'other', 'slice', 'output', 'lineage', 'query', 'search', 'absint'] as const; export type TestLabelContext = typeof _TestLabelContexts[number] export interface TestLabel extends MergeableRecord { diff --git a/test/functionality/abstract-interpretation/data-frame/data-frame.ts b/test/functionality/abstract-interpretation/data-frame/data-frame.ts index cb3d9cca23..aa5acf66df 100644 --- a/test/functionality/abstract-interpretation/data-frame/data-frame.ts +++ b/test/functionality/abstract-interpretation/data-frame/data-frame.ts @@ -7,11 +7,13 @@ import { DEFAULT_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/d import { RType } from 
'../../../../src/r-bridge/lang-4.x/ast/model/type'; import { requestFromInput } from '../../../../src/r-bridge/retriever'; import type { RShell } from '../../../../src/r-bridge/shell'; -import type { SingleSlicingCriterion } from '../../../../src/slicing/criterion/parse'; +import type { SingleSlicingCriterion, SlicingCriteria } from '../../../../src/slicing/criterion/parse'; import { slicingCriterionToId } from '../../../../src/slicing/criterion/parse'; import { assertUnreachable } from '../../../../src/util/assert'; import { getRangeEnd } from '../../../../src/util/range'; import type { RSymbol } from '../../../../src/r-bridge/lang-4.x/ast/model/nodes/r-symbol'; +import { decorateLabelContext, type TestLabel } from '../../_helper/label'; +import type { ParentInformation } from '../../../../src/r-bridge/lang-4.x/ast/model/processing/decorate'; export enum DomainMatchingType { Exact = 'exact', @@ -32,56 +34,75 @@ export const DataFrameTestOverapproximation = { rows: DomainMatchingType.Overapproximation }; +/** Stores the inferred data frame constraints and AST node for a tested slicing criterion */ +interface CriterionTestEntry { + criterion: SingleSlicingCriterion, + value: DataFrameDomain, + node: RSymbol, + lineNumber: number +} + export function assertDataFrameDomain( shell: RShell, code: string, - criterion: SingleSlicingCriterion, - expected: DataFrameDomain, - name: string = code + expected: [SingleSlicingCriterion, DataFrameDomain][], + name: string | TestLabel = code ) { - test(name, async()=> { + test.each(expected)( decorateLabelContext(name, ['absint']), async(criterion, expect) => { const [value] = await getInferredDomainForCriterion(shell, code, criterion); - assert.deepStrictEqual(value.colnames, expected.colnames, 'column names differ'); - assert.deepStrictEqual(value.cols, expected.cols, 'column count differs'); - assert.deepStrictEqual(value.rows, expected.rows, 'row count differs'); + assert.deepStrictEqual(value.colnames, expect.colnames, 'column 
names differ'); + assert.deepStrictEqual(value.cols, expect.cols, 'column count differs'); + assert.deepStrictEqual(value.rows, expect.rows, 'row count differs'); }); } -export function testDataFrameDomain( +export function testDataFrameDomainAgainstReal( shell: RShell, code: string, - criterion: SingleSlicingCriterion, + criteria: SlicingCriteria, /** Whether the inferred properties should match exacly the actual properties or can be an over-approximation (defaults to exact for all properties) */ options?: Partial, - name: string = code + name: string | TestLabel = code ): void { const effectiveOptions = { ...DataFrameTestExact, ...options }; - test(name, async()=> { - const [value, node] = await getInferredDomainForCriterion(shell, code, criterion); - const lineNumber = getRangeEnd(node.location)?.[0]; + test(decorateLabelContext(name, ['absint']), async()=> { + const testEntries: CriterionTestEntry[] = []; + + for(const criterion of criteria) { + const [value, node] = await getInferredDomainForCriterion(shell, code, criterion); + const lineNumber = getRangeEnd(node.info.fullRange ?? 
node.location)?.[0]; - if(lineNumber === undefined) { - throw new Error(`cannot resolve line of criterion ${criterion}`); + if(lineNumber === undefined) { + throw new Error(`cannot resolve line of criterion ${criterion}`); + } + testEntries.push({ criterion, value, node, lineNumber }); } + testEntries.sort((a, b) => b.lineNumber - a.lineNumber); const lines = code.split('\n'); - const outputCode = [ - createCodeForOutput('colnames', criterion, node.content), - createCodeForOutput('cols', criterion, node.content), - createCodeForOutput('rows', criterion, node.content) - ]; - lines.splice(lineNumber + 1, 0, ...outputCode); + + for(const { criterion, node, lineNumber } of testEntries) { + const outputCode = [ + createCodeForOutput('colnames', criterion, node.content), + createCodeForOutput('cols', criterion, node.content), + createCodeForOutput('rows', criterion, node.content) + ]; + lines.splice(lineNumber + 1, 0, ...outputCode); + } const instrumentedCode = lines.join('\n'); shell.clearEnvironment(); const output = await shell.sendCommandWithOutput(instrumentedCode); - const colnames = getRealDomainFromOutput('colnames', criterion, output); - const cols = getRealDomainFromOutput('cols', criterion, output); - const rows = getRealDomainFromOutput('rows', criterion, output); - assertDomainMatching('colnames', value.colnames, colnames, leqColNames, effectiveOptions.colnames); - assertDomainMatching('cols', value.cols, cols, leqInterval, effectiveOptions.cols); - assertDomainMatching('rows', value.rows, rows, leqInterval, effectiveOptions.rows); + for(const { criterion, value } of testEntries) { + const colnames = getRealDomainFromOutput('colnames', criterion, output); + const cols = getRealDomainFromOutput('cols', criterion, output); + const rows = getRealDomainFromOutput('rows', criterion, output); + + assertDomainMatching('colnames', value.colnames, colnames, leqColNames, effectiveOptions.colnames); + assertDomainMatching('cols', value.cols, cols, leqInterval, 
effectiveOptions.cols); + assertDomainMatching('rows', value.rows, rows, leqInterval, effectiveOptions.rows); + } }); } @@ -123,7 +144,7 @@ async function getInferredDomainForCriterion( shell: RShell, code: string, criterion: SingleSlicingCriterion -): Promise<[DataFrameDomain, RSymbol]> { +): Promise<[DataFrameDomain, RSymbol]> { const result = await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, { parser: shell, request: requestFromInput(code) diff --git a/test/functionality/abstract-interpretation/data-frame/inference.test.ts b/test/functionality/abstract-interpretation/data-frame/inference.test.ts index 527810c6bf..47755dc60e 100644 --- a/test/functionality/abstract-interpretation/data-frame/inference.test.ts +++ b/test/functionality/abstract-interpretation/data-frame/inference.test.ts @@ -1,120 +1,96 @@ import { describe } from 'vitest'; import { withShell } from '../../_helper/shell'; import { ColNamesTop, DataFrameTop } from '../../../../src/abstract-interpretation/data-frame/domain'; -import { testDataFrameDomain, assertDataFrameDomain, DomainMatchingType, DataFrameTestOverapproximation } from './data-frame'; +import { testDataFrameDomainAgainstReal, assertDataFrameDomain, DomainMatchingType, DataFrameTestOverapproximation } from './data-frame'; describe.sequential('Data Frame Abstract Interpretation', withShell(shell => { assertDataFrameDomain( shell, 'df <- data.frame(id = 1:5, age = c(25, 32, 35, 40, 45), score = c(90, 85, 88, 92, 95), row.names = NULL)', - '1@df', - { - colnames: ['id', 'age', 'score'], - cols: [3, 3], - rows: [5, 5] - } + [['1@df', { colnames: ['id', 'age', 'score'], cols: [3, 3], rows: [5, 5] }]] ); - testDataFrameDomain( + testDataFrameDomainAgainstReal( shell, 'df <- data.frame(id = 1:5, age = c(25, 32, 35, 40, 45), score = c(90, 85, 88, 92, 95), row.names = NULL)', - '1@df' + ['1@df'] ); assertDataFrameDomain( shell, 'df <- data.frame(id = c(1, 2, 3, 5, 6, 7), category = c("A", "B", "A", "A", "B", "B"))', - '1@df', - { - colnames: 
['id', 'category'], - cols: [2, 2], - rows: [6, 6] - } + [['1@df', { colnames: ['id', 'category'], cols: [2, 2], rows: [6, 6] }]] ); - testDataFrameDomain( + testDataFrameDomainAgainstReal( shell, 'df <- data.frame(id = c(1, 2, 3, 5, 6, 7), category = c("A", "B", "A", "A", "B", "B"))', - '1@df' + ['1@df'] ); assertDataFrameDomain( shell, 'df <- data.frame(c(1, 2, 3:5, c(6, 7, c(8, 9))), c("a", "b", "c"))', - '1@df', - { - colnames: ColNamesTop, - cols: [2, 2], - rows: [9, 9] - } + [['1@df', { colnames: ColNamesTop, cols: [2, 2], rows: [9, 9] }]] ); - testDataFrameDomain( + testDataFrameDomainAgainstReal( shell, 'df <- data.frame(c(1, 2, 3:5, c(6, 7, c(8, 9))), c("a", "b", "c"))', - '1@df', + ['1@df'], { colnames: DomainMatchingType.Overapproximation } ); assertDataFrameDomain( shell, 'df <- data.frame()', - '1@df', - { - colnames: [], - cols: [0, 0], - rows: [0, 0] - } + [['1@df', { colnames: [], cols: [0, 0], rows: [0, 0] }]] ); - testDataFrameDomain( + testDataFrameDomainAgainstReal( shell, 'df <- data.frame()', - '1@df' + ['1@df'] ); assertDataFrameDomain( shell, 'df1 <- data.frame(id = 1:5); df2 <- df1', - '1@df2', - { - colnames: ['id'], - cols: [1, 1], - rows: [5, 5] - } + [ + ['1@df1', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['1@df2', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }] + ] ); - testDataFrameDomain( + testDataFrameDomainAgainstReal( shell, 'df1 <- data.frame(id = 1:5); df2 <- df1', - '1@df2' + ['1@df1', '1@df2'] ); assertDataFrameDomain( shell, 'df <- read.csv("test.csv")', - '1@df', - DataFrameTop + [['1@df', DataFrameTop]] ); - testDataFrameDomain( + testDataFrameDomainAgainstReal( shell, 'df <- read.csv(text = "id,age\\n1,30\\n2,50\\n3,45")', - '1@df', + ['1@df'], DataFrameTestOverapproximation ); assertDataFrameDomain( shell, 'df <- eval(parse(text = "data.frame()"))', - '1@df', - DataFrameTop + [['1@df', DataFrameTop]] ); - testDataFrameDomain( + testDataFrameDomainAgainstReal( shell, 'df <- eval(parse(text = 
"data.frame()"))', - '1@df', + ['1@df'], DataFrameTestOverapproximation ); })); From be220a385a2d74abe448743f05c0a6655b759c62 Mon Sep 17 00:00:00 2001 From: Oliver Date: Tue, 18 Mar 2025 13:11:15 +0100 Subject: [PATCH 07/11] feat: apply semantics also to data frame operation arguments --- .../data-frame/absint-info.ts | 2 +- .../data-frame/semantics.ts | 43 ++++++++------- .../data-frame/inference.test.ts | 52 +++++++++++++++++++ 3 files changed, 78 insertions(+), 19 deletions(-) diff --git a/src/abstract-interpretation/data-frame/absint-info.ts b/src/abstract-interpretation/data-frame/absint-info.ts index 06d62db241..9477ff4472 100644 --- a/src/abstract-interpretation/data-frame/absint-info.ts +++ b/src/abstract-interpretation/data-frame/absint-info.ts @@ -11,7 +11,7 @@ export interface DataFrameOperation { interface DataFrameInfo { type: string; - domain?: Map + domain?: ReadonlyMap } export interface DataFrameAssignmentInfo extends DataFrameInfo { diff --git a/src/abstract-interpretation/data-frame/semantics.ts b/src/abstract-interpretation/data-frame/semantics.ts index 1803745139..5dd04c657f 100644 --- a/src/abstract-interpretation/data-frame/semantics.ts +++ b/src/abstract-interpretation/data-frame/semantics.ts @@ -6,7 +6,7 @@ import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-i import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; import type { DataFrameDomain } from './domain'; import { DataFrameTop, joinDataFrames } from './domain'; -import type { AbstractInterpretationInfo, DataFrameAssignmentInfo, DataFrameExpressionInfo } from './absint-info'; +import type { AbstractInterpretationInfo, DataFrameAssignmentInfo, DataFrameExpressionInfo, DataFrameOperation } from './absint-info'; import { DataFrameSemanticsMapper } from './expression-semantics'; export function applySemantics( @@ -22,6 +22,8 @@ export function applySemantics( dataFrameDomain = applyExpressionSemantics(node, domain, resolveInfo); } else if(node.type === 
RType.FunctionCall && node.named) { dataFrameDomain = applySemantics(node.functionName, domain, resolveInfo); + } else if(node.type === RType.Argument && node.value !== undefined) { + dataFrameDomain = applySemantics(node.value, domain, resolveInfo); } else if(node.type === RType.Symbol && resolveInfo.environment !== undefined) { const identifiers = resolveByName(node.content, resolveInfo.environment); const values = identifiers?.map(id => domain.get(id.nodeId) ?? DataFrameTop); @@ -62,31 +64,36 @@ function applyExpressionSemantics( let dataFrameDomain = DataFrameTop; for(const operation of node.info.dataFrame.operations) { - if(operation.operand === undefined) { - const semanticsApplier = DataFrameSemanticsMapper[operation.operation]; - dataFrameDomain = semanticsApplier(dataFrameDomain, operation, resolveInfo); - } else { - const operand = resolveInfo.idMap?.get(operation.operand); - const operandDomain = operand ? applySemantics(operand, domain, resolveInfo) : DataFrameTop; - const semanticsApplier = DataFrameSemanticsMapper[operation.operation]; - dataFrameDomain = semanticsApplier(operandDomain, operation, resolveInfo); + const operand = operation.operand ? resolveInfo.idMap?.get(operation.operand) : undefined; + const operandDomain = operand ? applySemantics(operand, domain, resolveInfo) : undefined; + applyArgumentSemantics(operation, domain, resolveInfo); + const semanticsApplier = DataFrameSemanticsMapper[operation.operation]; + dataFrameDomain = semanticsApplier(operandDomain ?? dataFrameDomain, operation, resolveInfo); - if(operand !== undefined && operation.modify) { - let origins = [operand.info.id]; + if(operand !== undefined && operation.modify) { + let origins = [operand.info.id]; - if(operand.type === RType.Symbol && resolveInfo.environment !== undefined) { - const identifiers = resolveByName(operand.content, resolveInfo.environment); - origins = identifiers?.map(id => id.nodeId) ?? 
origins; - } - for(const origin of origins) { - domain.set(origin, dataFrameDomain); - } + if(operand.type === RType.Symbol && resolveInfo.environment !== undefined) { + const identifiers = resolveByName(operand.content, resolveInfo.environment); + origins = identifiers?.map(id => id.nodeId) ?? origins; } + origins.forEach(origin => domain.set(origin, dataFrameDomain)); } } return dataFrameDomain; } +function applyArgumentSemantics( + operation: DataFrameOperation, + domain: Map, + resolveInfo : ResolveInfo +): void { + operation.arguments + .map(arg => arg ? resolveInfo.idMap?.get(arg) : undefined) + .filter(arg => arg !== undefined) + .forEach(arg => applySemantics(arg, domain, resolveInfo)); +} + function isAssignment( node: RNode ): node is RNode { diff --git a/test/functionality/abstract-interpretation/data-frame/inference.test.ts b/test/functionality/abstract-interpretation/data-frame/inference.test.ts index 527810c6bf..8e85d2cd11 100644 --- a/test/functionality/abstract-interpretation/data-frame/inference.test.ts +++ b/test/functionality/abstract-interpretation/data-frame/inference.test.ts @@ -117,4 +117,56 @@ describe.sequential('Data Frame Abstract Interpretation', withShell(shell => { '1@df', DataFrameTestOverapproximation ); + + assertDataFrameDomain( + shell, + 'df <- data.frame(id = 1:3, type = c("A", "B", "C"))\ndf <- data.frame()\nprint(df)', + '3@df', + { + colnames: [], + cols: [0, 0], + rows: [0, 0] + } + ); + + testDataFrameDomain( + shell, + 'df <- data.frame(id = 1:3, type = c("A", "B", "C"))\ndf <- data.frame()\nprint(df)', + '3@df' + ); + + assertDataFrameDomain( + shell, + 'df <- data.frame(id = 1:3, type = c("A", "B", "C"))\nprint(df <- data.frame())\nprint(df)', + '3@df', + { + colnames: [], + cols: [0, 0], + rows: [0, 0] + } + ); + + testDataFrameDomain( + shell, + 'df <- data.frame(id = 1:3, type = c("A", "B", "C"))\nprint(df <- data.frame())\nprint(df)', + '3@df' + ); + + assertDataFrameDomain( + shell, + 'df <- 1:3 |> data.frame(type = 
c("A", "B", "C"))', + '1@df', + { + colnames: ColNamesTop, + cols: [2, 2], + rows: [3, 3] + } + ); + + testDataFrameDomain( + shell, + 'df <- 1:3 |> data.frame(type = c("A", "B", "C"))', + '1@df', + { colnames: DomainMatchingType.Overapproximation } + ); })); From 57f04e2df8f057988145ecc079009e16fee0b73a Mon Sep 17 00:00:00 2001 From: Oliver Date: Fri, 4 Apr 2025 07:36:27 +0000 Subject: [PATCH 08/11] Improve performance of data frame absint tests (#1490) * test: parse code only once and check criteria on result * test: support parser selection for data frame assert tests --- .../data-frame/data-frame.ts | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/test/functionality/abstract-interpretation/data-frame/data-frame.ts b/test/functionality/abstract-interpretation/data-frame/data-frame.ts index ea04586dec..8273c93c7f 100644 --- a/test/functionality/abstract-interpretation/data-frame/data-frame.ts +++ b/test/functionality/abstract-interpretation/data-frame/data-frame.ts @@ -1,20 +1,23 @@ -import { assert, test } from 'vitest'; +import { assert, beforeAll, test } from 'vitest'; import type { DataFrameDomain } from '../../../../src/abstract-interpretation/data-frame/domain'; import { leqColNames, leqInterval } from '../../../../src/abstract-interpretation/data-frame/domain'; +import type { AbstractInterpretationInfo } from '../../../../src/abstract-interpretation/data-frame/absint-info'; import { PipelineExecutor } from '../../../../src/core/pipeline-executor'; -import { DEFAULT_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines'; +import type { TREE_SITTER_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines'; +import { createDataflowPipeline, DEFAULT_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines'; import { RType } from '../../../../src/r-bridge/lang-4.x/ast/model/type'; import { requestFromInput } from '../../../../src/r-bridge/retriever'; 
import type { RShell } from '../../../../src/r-bridge/shell'; import type { SingleSlicingCriterion, SlicingCriteria } from '../../../../src/slicing/criterion/parse'; import { slicingCriterionToId } from '../../../../src/slicing/criterion/parse'; -import { assertUnreachable } from '../../../../src/util/assert'; +import { assertUnreachable, guard, isNotUndefined } from '../../../../src/util/assert'; import { getRangeEnd } from '../../../../src/util/range'; import type { RSymbol } from '../../../../src/r-bridge/lang-4.x/ast/model/nodes/r-symbol'; import { resolveDataFrameValue } from '../../../../src/abstract-interpretation/data-frame/abstract-interpretation'; import { decorateLabelContext, type TestLabel } from '../../_helper/label'; import type { ParentInformation } from '../../../../src/r-bridge/lang-4.x/ast/model/processing/decorate'; -import type { AbstractInterpretationInfo } from '../../../../src/abstract-interpretation/data-frame/absint-info'; +import type { PipelineOutput } from '../../../../src/core/steps/pipeline/pipeline'; +import type { KnownParser } from '../../../../src/r-bridge/parser'; export enum DomainMatchingType { Exact = 'exact', @@ -44,13 +47,20 @@ interface CriterionTestEntry { } export function assertDataFrameDomain( - shell: RShell, + parser: KnownParser, code: string, expected: [SingleSlicingCriterion, DataFrameDomain][], name: string | TestLabel = code ) { - test.each(expected)( decorateLabelContext(name, ['absint']), async(criterion, expect) => { - const [value] = await getInferredDomainForCriterion(shell, code, criterion); + let result: PipelineOutput | undefined; + + beforeAll(async() => { + result = await createDataflowPipeline(parser, { request: requestFromInput(code) }).allRemainingSteps(); + }); + + test.each(expected)(decorateLabelContext(name, ['absint']), (criterion, expect) => { + guard(isNotUndefined(result), 'Result cannot be undefined'); + const [value] = getInferredDomainForCriterion(result, criterion); 
assert.deepStrictEqual(value.colnames, expect.colnames, 'column names differ'); assert.deepStrictEqual(value.cols, expect.cols, 'column count differs'); @@ -67,11 +77,17 @@ export function testDataFrameDomainAgainstReal( name: string | TestLabel = code ): void { const effectiveOptions = { ...DataFrameTestExact, ...options }; + test(decorateLabelContext(name, ['absint']), async()=> { + const result = await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, { + parser: shell, + request: requestFromInput(code) + }).allRemainingSteps(); + const testEntries: CriterionTestEntry[] = []; for(const criterion of criteria) { - const [value, node] = await getInferredDomainForCriterion(shell, code, criterion); + const [value, node] = getInferredDomainForCriterion(result, criterion); const lineNumber = getRangeEnd(node.info.fullRange ?? node.location)?.[0]; if(lineNumber === undefined) { @@ -141,16 +157,10 @@ function createCodeForOutput( } } -async function getInferredDomainForCriterion( - shell: RShell, - code: string, +function getInferredDomainForCriterion( + result: PipelineOutput, criterion: SingleSlicingCriterion -): Promise<[DataFrameDomain, RSymbol]> { - const result = await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, { - parser: shell, - request: requestFromInput(code) - }).allRemainingSteps(); - +): [DataFrameDomain, RSymbol] { const idMap = result.dataflow.graph.idMap ?? 
result.normalize.idMap; const nodeId = slicingCriterionToId(criterion, idMap); const node = idMap.get(nodeId); @@ -172,7 +182,7 @@ function getRealDomainFromOutput( const line = output.find(line => line.startsWith(marker))?.replace(marker, '').trim(); if(line === undefined) { - throw new Error(`cannot parse output of instrumented code for ${type}`); + throw new Error(`cannot parse ${type} output of instrumented code for ${criterion}`); } switch(type) { case 'colnames': { From acdd11f19c3781d33b8a649ad343aa313d3a119e Mon Sep 17 00:00:00 2001 From: Oliver Date: Wed, 16 Apr 2025 18:22:24 +0200 Subject: [PATCH 09/11] Implement fold over CFG for forward traversal (#1530) * feat: basic fold over CFG for forward traversal * feat: add simplified forward-connected control flow graph * feat-fix: correct label for CFG exit point edges * test: unit tests for simple control flow graph * feat: add CFG visitor for fixpoint iteration for data frames * feat-fix: add missing negation in has-changed check * feat-fix: subtract top from colnames should have no effect --- .../data-frame/absint-info.ts | 34 --- .../data-frame/abstract-interpretation.ts | 68 +++-- .../data-frame/domain.ts | 36 ++- .../data-frame/expression-semantics.ts | 40 --- .../data-frame/process/data-frame-access.ts | 57 ---- .../process/data-frame-assignment.ts | 39 --- .../process/data-frame-expression-list.ts | 25 -- .../process/data-frame-function-call.ts | 66 ----- .../data-frame/process/data-frame-pipe.ts | 35 --- .../data-frame/processor.ts | 44 +++ .../data-frame/semantics.ts | 107 ------- src/abstract-interpretation/simple-cfg.ts | 260 ++++++++++++++++++ src/cli/repl/commands/repl-cfg.ts | 20 ++ src/cli/repl/commands/repl-commands.ts | 31 ++- src/dataflow/environments/built-in-config.ts | 2 +- src/dataflow/environments/built-in.ts | 51 +--- src/dataflow/environments/resolve-by-name.ts | 5 +- .../data-frame/data-frame.ts | 18 +- .../data-frame/domain.test.ts | 4 +- .../data-frame/inference.test.ts | 2 +- 
.../simple-cfg.test.ts | 172 ++++++++++++ 21 files changed, 612 insertions(+), 504 deletions(-) delete mode 100644 src/abstract-interpretation/data-frame/absint-info.ts delete mode 100644 src/abstract-interpretation/data-frame/expression-semantics.ts delete mode 100644 src/abstract-interpretation/data-frame/process/data-frame-access.ts delete mode 100644 src/abstract-interpretation/data-frame/process/data-frame-assignment.ts delete mode 100644 src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts delete mode 100644 src/abstract-interpretation/data-frame/process/data-frame-function-call.ts delete mode 100644 src/abstract-interpretation/data-frame/process/data-frame-pipe.ts create mode 100644 src/abstract-interpretation/data-frame/processor.ts delete mode 100644 src/abstract-interpretation/data-frame/semantics.ts create mode 100644 src/abstract-interpretation/simple-cfg.ts create mode 100644 test/functionality/abstract-interpretation/simple-cfg.test.ts diff --git a/src/abstract-interpretation/data-frame/absint-info.ts b/src/abstract-interpretation/data-frame/absint-info.ts deleted file mode 100644 index 9477ff4472..0000000000 --- a/src/abstract-interpretation/data-frame/absint-info.ts +++ /dev/null @@ -1,34 +0,0 @@ -import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; -import type { DataFrameDomain } from './domain'; -import type { DataFrameOperationName } from './expression-semantics'; - -export interface DataFrameOperation { - operation: DataFrameOperationName, - operand: NodeId | undefined, - arguments: (NodeId | undefined)[], - modify?: boolean -} - -interface DataFrameInfo { - type: string; - domain?: ReadonlyMap -} - -export interface DataFrameAssignmentInfo extends DataFrameInfo { - type: 'assignment', - identifier: NodeId, - expression: NodeId -} - -export interface DataFrameExpressionInfo extends DataFrameInfo { - type: 'expression', - operations: DataFrameOperation[] -} - -export interface 
DataFrameOtherInfo extends DataFrameInfo { - type: 'other' -} - -export interface AbstractInterpretationInfo { - dataFrame?: DataFrameAssignmentInfo | DataFrameExpressionInfo | DataFrameOtherInfo -} diff --git a/src/abstract-interpretation/data-frame/abstract-interpretation.ts b/src/abstract-interpretation/data-frame/abstract-interpretation.ts index ac772ff91a..b2e1ba723f 100644 --- a/src/abstract-interpretation/data-frame/abstract-interpretation.ts +++ b/src/abstract-interpretation/data-frame/abstract-interpretation.ts @@ -1,23 +1,49 @@ -import type { REnvironmentInformation } from '../../dataflow/environments/environment'; -import { resolveByName } from '../../dataflow/environments/resolve-by-name'; -import type { RSymbol } from '../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; -import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; -import type { AbstractInterpretationInfo } from './absint-info'; -import type { DataFrameDomain } from './domain'; -import { DataFrameTop, joinDataFrames } from './domain'; - -export function resolveDataFrameValue( - node: RSymbol, - environment: REnvironmentInformation, - identifier: string = node.content -): DataFrameDomain { - const domain = node.info.dataFrame?.domain; - - if(domain === undefined) { - return DataFrameTop; - } - const identifiers = resolveByName(identifier, environment); - const values = identifiers?.map(id => domain.get(id.nodeId) ?? 
DataFrameTop); +import type { DataflowGraph } from '../../dataflow/graph/graph'; +import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; +import { RFalse, RTrue } from '../../r-bridge/lang-4.x/convert-values'; +import { guard } from '../../util/assert'; +import type { CfgEdge } from '../../util/cfg/cfg'; +import type { SimpleControlFlowGraph, SimpleControlFlowInformation } from '../simple-cfg'; +import type { DataFrameDomain, DataFrameStateDomain } from './domain'; +import { equalDataFrameState } from './domain'; +import { processDataFrameNode } from './processor'; + +export function performDataFrameAbsint(cfg: SimpleControlFlowInformation, dfg: DataflowGraph) { + const visited: Set = new Set(); + + const visitor = (cfg: SimpleControlFlowGraph, nodeId: NodeId, domain: DataFrameStateDomain) => { + const node = dfg.idMap?.get(nodeId); + guard(node !== undefined, 'Node must not be undefined'); + + const result = processDataFrameNode(node, domain, dfg); + const equal = 'FD' in result ? { + 'FD': equalDataFrameState(domain, result['FD']), + [RTrue]: equalDataFrameState(domain, result[RTrue]), + [RFalse]: equalDataFrameState(domain, result[RFalse]) + } : equalDataFrameState(domain, result); + + const hasChanged = (edge: CfgEdge) => typeof equal === 'object' ? !equal[edge.label === 'FD' ? edge.label : edge.when] : !equal; + const getDomain = (edge: CfgEdge) => 'FD' in result ? result[edge.label === 'FD' ? edge.label : edge.when] : result; - return values ? joinDataFrames(...values) : DataFrameTop; + const successors = cfg.edges().get(nodeId)?.entries() + .filter(([successor, edge]) => !visited.has(successor) || hasChanged(edge)) + .map<[NodeId, DataFrameStateDomain]>(([successor, edge]) => [successor, getDomain(edge)]) + .toArray(); + + successors?.forEach(([successor]) => visited.add(successor)); + + return successors ?? 
[]; + }; + foldGraph(cfg.graph, cfg.entryPoints.map((entry) => [entry, new Map()]), visitor); +} + +function foldGraph( + cfg: SimpleControlFlowGraph, + nodes: [NodeId, DataFrameStateDomain][], + visitor: (cfg: SimpleControlFlowGraph, node: NodeId, domain: DataFrameStateDomain) => [NodeId, DataFrameStateDomain][] +): void { + for(const [node, domain] of nodes) { + const successors = visitor(cfg, node, domain); + foldGraph(cfg, successors, visitor); + } } diff --git a/src/abstract-interpretation/data-frame/domain.ts b/src/abstract-interpretation/data-frame/domain.ts index e3a97b6d50..87a6b81703 100644 --- a/src/abstract-interpretation/data-frame/domain.ts +++ b/src/abstract-interpretation/data-frame/domain.ts @@ -1,3 +1,6 @@ +import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; +import { setEquals } from '../../util/set'; + type Interval = [number, number]; export const IntervalBottom = 'bottom'; @@ -26,6 +29,12 @@ export const DataFrameTop: DataFrameDomain = { rows: IntervalTop }; +export type DataFrameStateDomain = Map; + +export function equalColNames(X1: ColNamesDomain, X2: ColNamesDomain): boolean { + return X1 === X2 || (X1 !== ColNamesTop && setEquals(new Set(X1), new Set(X2))); +} + export function leqColNames(X1: ColNamesDomain, X2: ColNamesDomain): boolean { return X2 === ColNamesTop || (X1 !== ColNamesTop && new Set(X1).isSubsetOf(new Set(X2))); } @@ -51,15 +60,19 @@ export function meetColNames(X1: ColNamesDomain, X2: ColNamesDomain): ColNamesDo } export function subtractColNames(X1: ColNamesDomain, X2: ColNamesDomain): ColNamesDomain { - if(X2 === ColNamesTop) { - return ColNamesBottom; - } else if(X1 === ColNamesTop) { + if(X1 === ColNamesTop) { return ColNamesTop; + } else if(X2 === ColNamesTop) { + return X1; } else { return Array.from(new Set(X1).difference(new Set(X2))); } } +export function equalInterval(X1: IntervalDomain, X2: IntervalDomain): boolean { + return X1 === X2 || (X1 !== IntervalBottom && X1[0] === X2[0] 
&& X1[1] === X2[1]); +} + export function leqInterval(X1: IntervalDomain, X2: IntervalDomain): boolean { return X1 === IntervalBottom || (X2 !== IntervalBottom && X2[0] <= X1[0] && X1[1] <= X2[1]); } @@ -111,3 +124,20 @@ export function meetDataFrames(...values: DataFrameDomain[]) { } return value; } + +export function equalDataFrameDomain(X1: DataFrameDomain, X2: DataFrameDomain) { + return equalColNames(X1.colnames, X2.colnames) && equalInterval(X1.cols, X2.cols) && equalInterval(X1.rows, X2.rows); +} + +export function equalDataFrameState(R1: DataFrameStateDomain, R2: DataFrameStateDomain) { + if(R1.size !== R2.size) { + return false; + } + for(const [key, value] of R1) { + const other = R2.get(key); + if(other === undefined || !equalDataFrameDomain(value, other)) { + return false; + } + } + return true; +} diff --git a/src/abstract-interpretation/data-frame/expression-semantics.ts b/src/abstract-interpretation/data-frame/expression-semantics.ts deleted file mode 100644 index c535cc67e3..0000000000 --- a/src/abstract-interpretation/data-frame/expression-semantics.ts +++ /dev/null @@ -1,40 +0,0 @@ -import type { ResolveInfo } from '../../dataflow/environments/resolve-by-name'; -import type { DataFrameOperation } from './absint-info'; -import { ColNamesBottom, ColNamesTop, DataFrameTop, IntervalTop, joinColNames, type ColNamesDomain, type DataFrameDomain } from './domain'; -import { resolveIdToArgName, resolveIdToArgVectorLength, resolveIdToArgValueSymbolName } from './resolve-args'; - -export const DataFrameSemanticsMapper = { - 'create': applyCreateSemantics, - 'accessCol': applyAccessColSemantics, - 'unknown': applyUnknownSemantics -} as const satisfies Record; - -export type DataFrameOperationName = keyof typeof DataFrameSemanticsMapper; -type DataFrameSemanticsApplier = (value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo) => DataFrameDomain; - -function applyCreateSemantics(value: DataFrameDomain, event: DataFrameOperation, info: 
ResolveInfo): DataFrameDomain { - const argNames = event.arguments.map(arg => arg ? resolveIdToArgName(arg, info) : undefined); - const argLengths = event.arguments.map(arg => arg ? resolveIdToArgVectorLength(arg, info) : undefined); - const colnames = argNames.some(arg => arg === undefined) ? ColNamesTop : argNames as ColNamesDomain; - const rowCount = argLengths.some(arg => arg === undefined) ? undefined : Math.max(...argLengths as number[], 0); - - return { - colnames: colnames, - cols: [event.arguments.length, event.arguments.length], - rows: rowCount !== undefined ? [rowCount, rowCount] : IntervalTop - }; -} - -function applyAccessColSemantics(value: DataFrameDomain, event: DataFrameOperation, info: ResolveInfo): DataFrameDomain { - const argNames = event.arguments.map(arg => arg ? resolveIdToArgValueSymbolName(arg, info) : undefined); - const colnames = argNames.some(arg => arg === undefined) ? ColNamesBottom : argNames as ColNamesDomain; - - return { - ...value, - colnames: joinColNames(value.colnames, colnames) - }; -} - -function applyUnknownSemantics(): DataFrameDomain { - return DataFrameTop; -} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-access.ts b/src/abstract-interpretation/data-frame/process/data-frame-access.ts deleted file mode 100644 index 1d8c548869..0000000000 --- a/src/abstract-interpretation/data-frame/process/data-frame-access.ts +++ /dev/null @@ -1,57 +0,0 @@ -import type { ForceArguments } from '../../../dataflow/internal/process/functions/call/common'; -import type { DataflowProcessorInformation } from '../../../dataflow/processor'; -import type { RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; -import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; 
-import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id'; -import type { AbstractInterpretationInfo } from '../absint-info'; - -export function processDataFrameAccess( - name: RSymbol, - args: readonly RFunctionArgument[], - rootId: NodeId, - data: DataflowProcessorInformation, - config: { treatIndicesAsString: boolean } & ForceArguments -) { - if(config.treatIndicesAsString) { - processDataFrameStringBasedAccess(name, args); - } else { - processDataFrameUnknownAccess(name, args); - } -} - -function processDataFrameStringBasedAccess( - name: RSymbol, - args: readonly RFunctionArgument[] -) { - const leftArg = args[0] !== EmptyArgument ? args[0] : undefined; - const rightArg = args[1] !== EmptyArgument ? args[1] : undefined; - - if(args.length === 2 && leftArg?.value !== undefined && rightArg !== undefined) { - name.info.dataFrame = { - type: 'expression', - operations: [{ - operation: 'accessCol', - operand: leftArg.value.info.id, - arguments: [rightArg.info.id] - }] - }; - } else { - processDataFrameUnknownAccess(name, args); - } -} - -function processDataFrameUnknownAccess( - name: RSymbol, - args: readonly RFunctionArgument[] -) { - name.info.dataFrame = { - type: 'expression', - operations: [{ - operation: 'unknown', - operand: args[0] !== EmptyArgument ? args[0]?.value?.info.id : undefined, - arguments: args.slice(1).map(arg => arg !== EmptyArgument ? 
arg.info.id : undefined) - }] - }; -} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-assignment.ts b/src/abstract-interpretation/data-frame/process/data-frame-assignment.ts deleted file mode 100644 index fa1bfcdd73..0000000000 --- a/src/abstract-interpretation/data-frame/process/data-frame-assignment.ts +++ /dev/null @@ -1,39 +0,0 @@ -import type { RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; -import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; -import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; -import type { AbstractInterpretationInfo } from '../absint-info'; - -export function processDataFrameAssignment( - name: RSymbol, - args: readonly RFunctionArgument[] -) { - const leftArg = args[0] !== EmptyArgument ? args[0] : undefined; - const rightArg = args[1] !== EmptyArgument ? args[1] : undefined; - - if(args.length === 2 && leftArg?.value?.type === RType.Symbol && rightArg?.value !== undefined) { - name.info.dataFrame = { - type: 'assignment', - identifier: leftArg.value.info.id, - expression: rightArg.value.info.id - }; - } else { - processDataFrameUnknownAssignment(name, args); - } -} - -function processDataFrameUnknownAssignment( - name: RSymbol, - args: readonly RFunctionArgument[] -) { - name.info.dataFrame = { - type: 'expression', - operations: [{ - operation: 'unknown', - operand: args[0] !== EmptyArgument ? args[0]?.value?.info.id : undefined, - arguments: args.slice(1).map(arg => arg !== EmptyArgument ? 
arg.info.id : undefined), - modify: true - }] - }; -} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts b/src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts deleted file mode 100644 index cb18997eaf..0000000000 --- a/src/abstract-interpretation/data-frame/process/data-frame-expression-list.ts +++ /dev/null @@ -1,25 +0,0 @@ -import type { DataflowProcessorInformation } from '../../../dataflow/processor'; -import type { RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; -import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; -import type { NodeId } from '../../../r-bridge/lang-4.x/ast/model/processing/node-id'; -import type { DataFrameDomain } from '../domain'; -import type { AbstractInterpretationInfo } from '../absint-info'; -import { applySemantics } from '../semantics'; - -export function processDataFrameExpressionList( - name: RSymbol, - args: readonly RFunctionArgument[], - rootId: NodeId, - data: DataflowProcessorInformation -) { - const resolveInfo = { environment: data.environment, idMap: data.completeAst.idMap, full: true }; - const domain: Map = new Map(); - - for(const arg of args) { - if(arg !== EmptyArgument && arg.value !== undefined) { - applySemantics(arg.value, domain, resolveInfo); - } - } -} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-function-call.ts b/src/abstract-interpretation/data-frame/process/data-frame-function-call.ts deleted file mode 100644 index 3c14b81960..0000000000 --- a/src/abstract-interpretation/data-frame/process/data-frame-function-call.ts +++ /dev/null @@ -1,66 +0,0 @@ -import type { BuiltInIdentifierProcessorDecorator } from '../../../dataflow/environments/built-in'; -import type { 
RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; -import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; -import type { AbstractInterpretationInfo } from '../absint-info'; - -const DataFrameFunctionMapper = { - 'data.frame': processDataFrameCreate -} as const satisfies Record>; - -const DataFrameSpecialArgumentsMapper: Partial> = { - 'data.frame': ['row.names', 'check.rows', 'check.names', 'fix.empty.names', 'stringsAsFactors'] -}; - -type DataFrameFunction = keyof typeof DataFrameFunctionMapper; - -export function processDataFrameFunctionCall( - name: RSymbol, - args: readonly RFunctionArgument[] -) { - if(name.content in DataFrameFunctionMapper) { - const functionName = name.content as DataFrameFunction; - const functionProcessor = DataFrameFunctionMapper[functionName]; - functionProcessor(name, getEffectiveArgs(functionName, args)); - } else { - processDataFrameUnknownCall(name, args); - } -} - -function processDataFrameCreate( - name: RSymbol, - args: readonly RFunctionArgument[] -) { - name.info.dataFrame = { - type: 'expression', - operations: [{ - operation: 'create', - operand: undefined, - arguments: args.map(arg => arg !== EmptyArgument ? arg.info.id : undefined) - }] - }; -} - -function processDataFrameUnknownCall( - name: RSymbol, - args: readonly RFunctionArgument[] -) { - name.info.dataFrame = { - type: 'expression', - operations: [{ - operation: 'unknown', - operand: undefined, - arguments: args.map(arg => arg !== EmptyArgument ? arg.info.id : undefined) - }] - }; -} - -function getEffectiveArgs( - funct: DataFrameFunction, - args: readonly RFunctionArgument[] -): readonly RFunctionArgument[] { - const specialArgs = DataFrameSpecialArgumentsMapper[funct] ?? 
[]; - - return args.filter(arg => arg === EmptyArgument || arg.name === undefined || !specialArgs.includes(arg.name.content)); -} diff --git a/src/abstract-interpretation/data-frame/process/data-frame-pipe.ts b/src/abstract-interpretation/data-frame/process/data-frame-pipe.ts deleted file mode 100644 index 813bcd1284..0000000000 --- a/src/abstract-interpretation/data-frame/process/data-frame-pipe.ts +++ /dev/null @@ -1,35 +0,0 @@ -import { EmptyArgument, type RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; -import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; -import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; -import type { AbstractInterpretationInfo } from '../absint-info'; -import { processDataFrameFunctionCall } from './data-frame-function-call'; - -export function processDataFramePipe( - name: RSymbol, - args: readonly RFunctionArgument[] -): void { - const leftArg = args[0] !== EmptyArgument ? args[0] : undefined; - const rightArg = args[1] !== EmptyArgument ? args[1] : undefined; - - if(leftArg !== undefined && rightArg?.value?.type === RType.FunctionCall && rightArg.value.named) { - processDataFrameFunctionCall(rightArg.value.functionName, [leftArg, ...rightArg.value.arguments]); - name.info.dataFrame = rightArg.value.functionName.info.dataFrame; - } else { - processDataFrameUnknownPipe(name, args); - } -} - -function processDataFrameUnknownPipe( - name: RSymbol, - args: readonly RFunctionArgument[] -) { - name.info.dataFrame = { - type: 'expression', - operations: [{ - operation: 'unknown', - operand: args[0] !== EmptyArgument ? args[0]?.value?.info.id : undefined, - arguments: args.slice(1).map(arg => arg !== EmptyArgument ? 
arg.info.id : undefined) - }] - }; -} diff --git a/src/abstract-interpretation/data-frame/processor.ts b/src/abstract-interpretation/data-frame/processor.ts new file mode 100644 index 0000000000..a5568af3ce --- /dev/null +++ b/src/abstract-interpretation/data-frame/processor.ts @@ -0,0 +1,44 @@ +import { type DataflowGraph } from '../../dataflow/graph/graph'; +import type { RNode } from '../../r-bridge/lang-4.x/ast/model/model'; +import type { RBinaryOp } from '../../r-bridge/lang-4.x/ast/model/nodes/r-binary-op'; +import type { RFunctionCall } from '../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import { EmptyArgument } from '../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; +import type { RFalse, RTrue } from '../../r-bridge/lang-4.x/convert-values'; +import type { DataFrameStateDomain } from './domain'; + +export type ConditionalDataFrameState = Record<'FD' | typeof RTrue | typeof RFalse, DataFrameStateDomain>; + +export function processDataFrameNode( + node: RNode, + domain: DataFrameStateDomain, + dfg: DataflowGraph +): DataFrameStateDomain | ConditionalDataFrameState { + switch(node.type) { + case RType.BinaryOp: + return processBinaryOp(node, domain, dfg); + case RType.FunctionCall: + return processFunctionCall(node, domain, dfg); + default: + return domain; + } +} + +function processBinaryOp( + node: RBinaryOp, + domain: DataFrameStateDomain, + dfg: DataflowGraph +): DataFrameStateDomain | ConditionalDataFrameState { + console.log(node, dfg.get(node.info.id), { lhs: dfg.get(node.lhs.info.id), rhs: dfg.get(node.rhs.info.id) }); + return domain; +} + +function processFunctionCall( + node: RFunctionCall, + domain: DataFrameStateDomain, + dfg: DataflowGraph +): DataFrameStateDomain | ConditionalDataFrameState { + console.log(node, dfg.get(node.info.id), node.arguments.map(arg => arg 
!== EmptyArgument && arg.value ? dfg.get(arg.value?.info.id) : undefined)); + return domain; +} diff --git a/src/abstract-interpretation/data-frame/semantics.ts b/src/abstract-interpretation/data-frame/semantics.ts deleted file mode 100644 index 5dd04c657f..0000000000 --- a/src/abstract-interpretation/data-frame/semantics.ts +++ /dev/null @@ -1,107 +0,0 @@ -import type { ResolveInfo } from '../../dataflow/environments/resolve-by-name'; -import { resolveByName } from '../../dataflow/environments/resolve-by-name'; -import type { RNode } from '../../r-bridge/lang-4.x/ast/model/model'; -import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; -import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; -import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; -import type { DataFrameDomain } from './domain'; -import { DataFrameTop, joinDataFrames } from './domain'; -import type { AbstractInterpretationInfo, DataFrameAssignmentInfo, DataFrameExpressionInfo, DataFrameOperation } from './absint-info'; -import { DataFrameSemanticsMapper } from './expression-semantics'; - -export function applySemantics( - node: RNode, - domain: Map, - resolveInfo : ResolveInfo -): DataFrameDomain { - let dataFrameDomain = DataFrameTop; - - if(isAssignment(node)) { - dataFrameDomain = applyAssignmentSemantics(node, domain, resolveInfo); - } else if(isExpression(node)) { - dataFrameDomain = applyExpressionSemantics(node, domain, resolveInfo); - } else if(node.type === RType.FunctionCall && node.named) { - dataFrameDomain = applySemantics(node.functionName, domain, resolveInfo); - } else if(node.type === RType.Argument && node.value !== undefined) { - dataFrameDomain = applySemantics(node.value, domain, resolveInfo); - } else if(node.type === RType.Symbol && resolveInfo.environment !== undefined) { - const identifiers = resolveByName(node.content, resolveInfo.environment); - const values = identifiers?.map(id => 
domain.get(id.nodeId) ?? DataFrameTop); - dataFrameDomain = values ? joinDataFrames(...values) : DataFrameTop; - } - node.info.dataFrame ??= { type: 'other' }; - node.info.dataFrame.domain = new Map(domain); - - return dataFrameDomain; -} - -function applyAssignmentSemantics( - node: RNode, - domain: Map, - resolveInfo : ResolveInfo -): DataFrameDomain { - let dataFrameDomain = DataFrameTop; - - const identifier = resolveInfo.idMap?.get(node.info.dataFrame.identifier); - const expression = resolveInfo.idMap?.get(node.info.dataFrame.expression); - - if(identifier?.type === RType.Symbol && expression !== undefined) { - dataFrameDomain = applySemantics(expression, domain, resolveInfo); - domain.set(identifier.info.id, dataFrameDomain); - } - if(identifier !== undefined) { - identifier.info.dataFrame ??= { type: 'other' }; - identifier.info.dataFrame.domain = new Map(domain); - } - return dataFrameDomain; -} - -function applyExpressionSemantics( - node: RNode, - domain: Map, - resolveInfo : ResolveInfo -): DataFrameDomain { - let dataFrameDomain = DataFrameTop; - - for(const operation of node.info.dataFrame.operations) { - const operand = operation.operand ? resolveInfo.idMap?.get(operation.operand) : undefined; - const operandDomain = operand ? applySemantics(operand, domain, resolveInfo) : undefined; - applyArgumentSemantics(operation, domain, resolveInfo); - const semanticsApplier = DataFrameSemanticsMapper[operation.operation]; - dataFrameDomain = semanticsApplier(operandDomain ?? dataFrameDomain, operation, resolveInfo); - - if(operand !== undefined && operation.modify) { - let origins = [operand.info.id]; - - if(operand.type === RType.Symbol && resolveInfo.environment !== undefined) { - const identifiers = resolveByName(operand.content, resolveInfo.environment); - origins = identifiers?.map(id => id.nodeId) ?? 
origins; - } - origins.forEach(origin => domain.set(origin, dataFrameDomain)); - } - } - return dataFrameDomain; -} - -function applyArgumentSemantics( - operation: DataFrameOperation, - domain: Map, - resolveInfo : ResolveInfo -): void { - operation.arguments - .map(arg => arg ? resolveInfo.idMap?.get(arg) : undefined) - .filter(arg => arg !== undefined) - .forEach(arg => applySemantics(arg, domain, resolveInfo)); -} - -function isAssignment( - node: RNode -): node is RNode { - return node.info.dataFrame?.type === 'assignment'; -} - -function isExpression( - node: RNode -): node is RNode { - return node.info.dataFrame?.type === 'expression'; -} diff --git a/src/abstract-interpretation/simple-cfg.ts b/src/abstract-interpretation/simple-cfg.ts new file mode 100644 index 0000000000..56a72cbe1a --- /dev/null +++ b/src/abstract-interpretation/simple-cfg.ts @@ -0,0 +1,260 @@ +import type { RForLoop } from '../r-bridge/lang-4.x/ast/model/nodes/r-for-loop'; +import type { RFunctionCall , EmptyArgument } from '../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; + +import type { RFunctionDefinition } from '../r-bridge/lang-4.x/ast/model/nodes/r-function-definition'; +import type { RRepeatLoop } from '../r-bridge/lang-4.x/ast/model/nodes/r-repeat-loop'; +import type { RWhileLoop } from '../r-bridge/lang-4.x/ast/model/nodes/r-while-loop'; +import type { ParentInformation, NormalizedAst, RNodeWithParent } from '../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { FoldFunctions } from '../r-bridge/lang-4.x/ast/model/processing/fold'; +import { foldAst } from '../r-bridge/lang-4.x/ast/model/processing/fold'; +import type { NodeId } from '../r-bridge/lang-4.x/ast/model/processing/node-id'; +import { RoleInParent } from '../r-bridge/lang-4.x/ast/model/processing/role'; +import type { RType } from '../r-bridge/lang-4.x/ast/model/type'; +import { RTrue, RFalse } from '../r-bridge/lang-4.x/convert-values'; +import type { CfgEdge, CfgVertex, ControlFlowInformation } 
from '../util/cfg/cfg'; +import { CfgVertexType, ControlFlowGraph } from '../util/cfg/cfg'; + +export enum SimpleCfgVertexType { + Expression = 'expression', + IfThenElse = 'if-then-else', + ForLoop = 'for-loop', + RepeatLoop = 'repeat-loop', + WhileLoop = 'while-loop', + Break = 'break', + Next = 'next' +} + +export interface SimpleCfgVertex extends CfgVertex { + name: RType, + type: Exclude, + tag: SimpleCfgVertexType +} + +export class SimpleControlFlowGraph extends ControlFlowGraph { + addVertex(vertex: SimpleCfgVertex, rootVertex = true): this { + super.addVertex(vertex, rootVertex); + return this; + } + + vertices(): ReadonlyMap { + return super.vertices() as ReadonlyMap; + } +} + +export interface SimpleControlFlowInformation extends ControlFlowInformation { + graph: SimpleControlFlowGraph +} + +interface SimpleControlFlowInfo { + graph: SimpleControlFlowGraph, + returns: NodeId[], + breaks: NodeId[], + nexts: NodeId[], + entryPoints: NodeId[], + exitPoints: ExitPoint[] +} + +interface ExitPoint { + node: NodeId, + edge: CfgEdge +} + +const cfgFolds: FoldFunctions = { + foldNumber: cfgLeaf, + foldString: cfgLeaf, + foldLogical: cfgLeaf, + foldSymbol: cfgLeaf, + foldAccess: cfgLeaf, + foldBinaryOp: cfgLeaf, + foldPipe: cfgLeaf, + foldUnaryOp: cfgLeaf, + other: { + foldComment: cfgIgnore, + foldLineDirective: cfgIgnore + }, + loop: { + foldFor: cfgFor, + foldRepeat: cfgRepeat, + foldWhile: cfgWhile, + foldBreak: cfgBreak, + foldNext: cfgNext + }, + foldIfThenElse: cfgIfThenElse, + foldExprList: cfgExprList, + functions: { + foldFunctionDefinition: cfgFunctionDefinition, + foldFunctionCall: cfgFunctionCall, + foldParameter: cfgLeaf, + foldArgument: cfgLeaf + } +}; + +export function extractSimpleCFG(ast: NormalizedAst): SimpleControlFlowInformation { + const info = foldAst(ast.ast, cfgFolds); + + return { ...info, exitPoints: info.exitPoints.map(exit => exit.node) }; +} + +function cfgLeaf(leaf: RNodeWithParent): SimpleControlFlowInfo { + // We are only 
interested in actual expressions in an expression list + if(leaf.info.role === RoleInParent.ExpressionListChild) { + const graph = new SimpleControlFlowGraph(); + const vertex: SimpleCfgVertex = { id: leaf.info.id, name: leaf.type, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }; + graph.addVertex(vertex); + const exitPoints: ExitPoint[] = [{ node: leaf.info.id, edge: { label: 'FD' } }]; + + return { graph, breaks: [], nexts: [], returns: [], entryPoints: [leaf.info.id], exitPoints }; + } + return cfgIgnore(leaf); +} + +function cfgBreak(leaf: RNodeWithParent): SimpleControlFlowInfo { + const graph = new SimpleControlFlowGraph(); + const vertex: SimpleCfgVertex = { id: leaf.info.id, name: leaf.type, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.Break }; + graph.addVertex(vertex); + + return { graph, breaks: [leaf.info.id], nexts: [], returns: [], entryPoints: [leaf.info.id], exitPoints: [] }; +} + +function cfgNext(leaf: RNodeWithParent): SimpleControlFlowInfo { + const graph = new SimpleControlFlowGraph(); + const vertex: SimpleCfgVertex = { id: leaf.info.id, name: leaf.type, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.Next }; + graph.addVertex(vertex); + + return { graph, breaks: [], nexts: [leaf.info.id], returns: [], entryPoints: [leaf.info.id], exitPoints: [] }; +} + +function cfgIgnore(_leaf: RNodeWithParent): SimpleControlFlowInfo { + return { graph: new SimpleControlFlowGraph(), breaks: [], nexts: [], returns: [], entryPoints: [], exitPoints: [] }; +} + +function cfgIfThenElse(ifNode: RNodeWithParent, _condition: SimpleControlFlowInfo, then: SimpleControlFlowInfo, otherwise?: SimpleControlFlowInfo): SimpleControlFlowInfo { + const graph = then.graph; + const vertex: SimpleCfgVertex = { id: ifNode.info.id, name: ifNode.type, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.IfThenElse }; + graph.addVertex(vertex); + + if(otherwise) { + graph.merge(otherwise.graph); + } + + for(const entryPoint of 
then.entryPoints) { + graph.addEdge(ifNode.info.id, entryPoint, { label: 'CD', when: RTrue, caused: ifNode.info.id }); + } + for(const entryPoint of otherwise?.entryPoints ?? []) { + graph.addEdge(ifNode.info.id, entryPoint, { label: 'CD', when: RFalse, caused: ifNode.info.id }); + } + const exitPoints = [...then.exitPoints, ...otherwise?.exitPoints ?? []]; + + // add if-node itself as exit point if body is empty + if(then.entryPoints.length === 0) { + if(otherwise === undefined || otherwise.entryPoints.length === 0) { + exitPoints.push({ node: ifNode.info.id, edge: { label: 'FD' } }); + } else { + exitPoints.push({ node: ifNode.info.id, edge: { label: 'CD', when: RTrue, caused: ifNode.info.id } }); + } + } else if(otherwise === undefined || otherwise.entryPoints.length === 0) { + exitPoints.push({ node: ifNode.info.id, edge: { label: 'CD', when: RFalse, caused: ifNode.info.id } }); + } + + return { + graph, + breaks: [...then.breaks, ...otherwise?.breaks ?? []], + nexts: [...then.nexts, ...otherwise?.nexts ?? []], + returns: [...then.returns, ...otherwise?.returns ?? 
[]], + entryPoints: [ifNode.info.id], + exitPoints + }; +} + +function cfgRepeat(repeat: RRepeatLoop, body: SimpleControlFlowInfo): SimpleControlFlowInfo { + const graph = body.graph; + const vertex: SimpleCfgVertex = { id: repeat.info.id, name: repeat.type, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.RepeatLoop }; + graph.addVertex(vertex); + + for(const entryPoint of body.entryPoints) { + graph.addEdge(repeat.info.id, entryPoint, { label: 'FD' }); + } + for(const next of body.nexts) { + graph.addEdge(next, repeat.info.id, { label: 'FD' }); + } + for(const exitPoint of body.exitPoints) { + graph.addEdge(exitPoint.node, repeat.info.id, exitPoint.edge); + } + const exitPoints = body.breaks.map(node => ({ node, edge: { label: 'FD' } })); + + return { graph, breaks: [], nexts: [], returns: body.returns, entryPoints: [repeat.info.id], exitPoints }; +} + +function cfgWhile(whileLoop: RWhileLoop, _condition: SimpleControlFlowInfo, body: SimpleControlFlowInfo): SimpleControlFlowInfo { + const graph = body.graph; + const vertex: SimpleCfgVertex = { id: whileLoop.info.id, name: whileLoop.type, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.WhileLoop }; + graph.addVertex(vertex); + + for(const entryPoint of body.entryPoints) { + graph.addEdge(whileLoop.info.id, entryPoint, { label: 'CD', when: RTrue, caused: whileLoop.info.id }); + } + for(const next of body.nexts) { + graph.addEdge(next, whileLoop.info.id, { label: 'FD' }); + } + for(const exitPoint of body.exitPoints) { + graph.addEdge(exitPoint.node, whileLoop.info.id, exitPoint.edge); + } + const exitPoints = body.breaks.map(node => ({ node, edge: { label: 'FD' } })); + exitPoints.push({ node: whileLoop.info.id, edge: { label: 'CD', when: RFalse, caused: whileLoop.info.id } }); + + return { graph, breaks: [], nexts: [], returns: body.returns, entryPoints: [whileLoop.info.id], exitPoints }; +} + +function cfgFor(forLoop: RForLoop, _variable: SimpleControlFlowInfo, _vector: SimpleControlFlowInfo, body: 
SimpleControlFlowInfo): SimpleControlFlowInfo { + const graph = body.graph; + const vertex: SimpleCfgVertex = { id: forLoop.info.id, name: forLoop.type, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.ForLoop }; + graph.addVertex(vertex); + + for(const entryPoint of body.entryPoints) { + graph.addEdge(forLoop.info.id, entryPoint, { label: 'CD', when: RTrue, caused: forLoop.info.id }); + } + for(const next of body.nexts) { + graph.addEdge(next, forLoop.info.id, { label: 'FD' }); + } + for(const exitPoint of body.exitPoints) { + graph.addEdge(exitPoint.node, forLoop.info.id, exitPoint.edge); + } + const exitPoints = body.breaks.map(node => ({ node, edge: { label: 'FD' } })); + exitPoints.push({ node: forLoop.info.id, edge: { label: 'CD', when: RFalse, caused: forLoop.info.id } }); + + return { graph, breaks: [], nexts: [], returns: body.returns, entryPoints: [forLoop.info.id], exitPoints }; +} + +function cfgFunctionDefinition(fn: RFunctionDefinition, _params: SimpleControlFlowInfo[], _body: SimpleControlFlowInfo): SimpleControlFlowInfo { + // skip function definitions for now + return cfgIgnore(fn); +} + +function cfgFunctionCall(call: RFunctionCall, _name: SimpleControlFlowInfo, _args: (SimpleControlFlowInfo | typeof EmptyArgument)[]): SimpleControlFlowInfo { + // no resolve for function call targets to track function definitions for now + return cfgLeaf(call); +} + +function cfgExprList(_node: RNodeWithParent, _grouping: unknown, expressions: SimpleControlFlowInfo[]): SimpleControlFlowInfo { + const result: SimpleControlFlowInfo = { graph: new SimpleControlFlowGraph(), breaks: [], nexts: [], returns: [], entryPoints: [], exitPoints: [] }; + let first = true; + + for(const expression of expressions) { + if(first) { + result.entryPoints = expression.entryPoints; + first = false; + } else { + for(const prevExitPoint of result.exitPoints) { + for(const entryPoint of expression.entryPoints) { + result.graph.addEdge(prevExitPoint.node, entryPoint, 
prevExitPoint.edge); + } + } + } + result.graph.merge(expression.graph); + result.breaks.push(...expression.breaks); + result.nexts.push(...expression.nexts); + result.returns.push(...expression.returns); + result.exitPoints = expression.exitPoints; + } + return result; +} diff --git a/src/cli/repl/commands/repl-cfg.ts b/src/cli/repl/commands/repl-cfg.ts index 8d485a833a..49fe68249d 100644 --- a/src/cli/repl/commands/repl-cfg.ts +++ b/src/cli/repl/commands/repl-cfg.ts @@ -5,6 +5,8 @@ import { fileProtocol, requestFromInput } from '../../../r-bridge/retriever'; import { cfgToMermaid, cfgToMermaidUrl } from '../../../util/mermaid/cfg'; import type { KnownParser } from '../../../r-bridge/parser'; import { ColorEffect, Colors, FontStyles } from '../../../util/ansi'; +import { extractSimpleCFG } from '../../../abstract-interpretation/simple-cfg'; +import { performDataFrameAbsint } from '../../../abstract-interpretation/data-frame/abstract-interpretation'; async function controlflow(parser: KnownParser, remainingLine: string) { return await createDataflowPipeline(parser, { @@ -57,3 +59,21 @@ export const controlflowStarCommand: ReplCommand = { } catch{ /* do nothing this is a service thing */ } } }; + +export const absintDataFrameCommand: ReplCommand = { + description: 'Perform abstract interpretation for data frames', + usageExample: ':absint-dataframe', + aliases: [ 'absintdf', 'aidf' ], + script: false, + fn: async(output, shell, remainingLine) => { + const result = await controlflow(shell, handleString(remainingLine)); + const cfg = extractSimpleCFG(result.normalize); + const mermaid = cfgToMermaidUrl(cfg, result.normalize); + try { + const clipboard = await import('clipboardy'); + clipboard.default.writeSync(mermaid); + output.stdout(formatInfo(output, 'mermaid url')); + } catch{ /* do nothing this is a service thing */ } + performDataFrameAbsint(cfg, result.dataflow.graph); + } +}; diff --git a/src/cli/repl/commands/repl-commands.ts 
b/src/cli/repl/commands/repl-commands.ts index d3743d32dc..cc9bed5214 100644 --- a/src/cli/repl/commands/repl-commands.ts +++ b/src/cli/repl/commands/repl-commands.ts @@ -7,7 +7,7 @@ import { parseCommand } from './repl-parse'; import { executeCommand } from './repl-execute'; import { normalizeCommand, normalizeStarCommand } from './repl-normalize'; import { dataflowCommand, dataflowStarCommand } from './repl-dataflow'; -import { controlflowCommand, controlflowStarCommand } from './repl-cfg'; +import { controlflowCommand, controlflowStarCommand, absintDataFrameCommand } from './repl-cfg'; import type { OutputFormatter } from '../../../util/ansi'; import { italic , bold } from '../../../util/ansi'; import { splitAtEscapeSensitive } from '../../../util/args'; @@ -75,20 +75,21 @@ You can combine commands by separating them with a semicolon ${bold(';',output.f * All commands that should be available in the REPL. */ const _commands: Record = { - 'help': helpCommand, - 'quit': quitCommand, - 'version': versionCommand, - 'execute': executeCommand, - 'parse': parseCommand, - 'normalize': normalizeCommand, - 'normalize*': normalizeStarCommand, - 'dataflow': dataflowCommand, - 'dataflow*': dataflowStarCommand, - 'controlflow': controlflowCommand, - 'controlflow*': controlflowStarCommand, - 'lineage': lineageCommand, - 'query': queryCommand, - 'query*': queryStarCommand + 'help': helpCommand, + 'quit': quitCommand, + 'version': versionCommand, + 'execute': executeCommand, + 'parse': parseCommand, + 'normalize': normalizeCommand, + 'normalize*': normalizeStarCommand, + 'dataflow': dataflowCommand, + 'dataflow*': dataflowStarCommand, + 'controlflow': controlflowCommand, + 'controlflow*': controlflowStarCommand, + 'lineage': lineageCommand, + 'query': queryCommand, + 'query*': queryStarCommand, + 'absint-dataframe': absintDataFrameCommand }; let commandsInitialized = false; diff --git a/src/dataflow/environments/built-in-config.ts b/src/dataflow/environments/built-in-config.ts 
index 2964e9ed5a..2c2a3d8aec 100644 --- a/src/dataflow/environments/built-in-config.ts +++ b/src/dataflow/environments/built-in-config.ts @@ -80,7 +80,7 @@ export function registerBuiltInFunctions mappedProcessor(name, args, rootId, data, config as any), config, name, diff --git a/src/dataflow/environments/built-in.ts b/src/dataflow/environments/built-in.ts index 36f5ad51eb..6a77535c63 100644 --- a/src/dataflow/environments/built-in.ts +++ b/src/dataflow/environments/built-in.ts @@ -30,11 +30,6 @@ import { registerBuiltInDefinitions } from './built-in-config'; import { DefaultBuiltinConfig } from './default-builtin-config'; import type { LinkTo } from '../../queries/catalog/call-context-query/call-context-query-format'; import { processList } from '../internal/process/functions/call/built-in/built-in-list'; -import { processDataFrameAccess } from '../../abstract-interpretation/data-frame/process/data-frame-access'; -import { processDataFrameAssignment } from '../../abstract-interpretation/data-frame/process/data-frame-assignment'; -import { processDataFrameExpressionList } from '../../abstract-interpretation/data-frame/process/data-frame-expression-list'; -import { processDataFrameFunctionCall } from '../../abstract-interpretation/data-frame/process/data-frame-function-call'; -import { processDataFramePipe } from '../../abstract-interpretation/data-frame/process/data-frame-pipe'; import { processVector } from '../internal/process/functions/call/built-in/built-in-vector'; import { processRm } from '../internal/process/functions/call/built-in/built-in-rm'; @@ -55,14 +50,6 @@ export type BuiltInIdentifierProcessorWithConfig = ( config: Config ) => DataflowInformation -export type BuiltInIdentifierProcessorDecorator = ( - name: RSymbol, - args: readonly RFunctionArgument[], - rootId: NodeId, - data: DataflowProcessorInformation, - config: Config -) => void - export interface BuiltInIdentifierDefinition extends IdentifierReference { type: ReferenceType.BuiltInFunction 
definedAt: typeof BuiltIn @@ -144,34 +131,19 @@ export function registerBuiltInFunctions( - processor: BuiltInIdentifierProcessorWithConfig, - ...decorators: BuiltInProcessorDecoratorName[] -): BuiltInIdentifierProcessorWithConfig { - return (name, args, rootId, data, config) => { - const result = processor(name, args, rootId, data, config); - decorators - .map(name => BuiltInProcessorDecoratorMapper[name] as BuiltInIdentifierProcessorDecorator) - .forEach(decorator => decorator(name, args, rootId, { ...data, environment: result.environment }, config)); - - return result; - }; -} - export const BuiltInProcessorMapper = { - 'builtin:default': decorateProcessor(defaultBuiltInProcessor, 'dataframe:function-call'), + 'builtin:default': defaultBuiltInProcessor, 'builtin:apply': processApply, - 'builtin:expression-list': decorateProcessor(processExpressionList, 'dataframe:expression-list'), + 'builtin:expression-list': processExpressionList, 'builtin:source': processSourceCall, - 'builtin:access': decorateProcessor(processAccess, 'dataframe:access'), + 'builtin:access': processAccess, 'builtin:if-then-else': processIfThenElse, 'builtin:get': processGet, 'builtin:rm': processRm, 'builtin:library': processLibrary, - 'builtin:assignment': decorateProcessor(processAssignment, 'dataframe:assignment'), + 'builtin:assignment': processAssignment, 'builtin:special-bin-op': processSpecialBinOp, - 'builtin:pipe': decorateProcessor(processPipe, 'dataframe:pipe'), + 'builtin:pipe': processPipe, 'builtin:function-definition': processFunctionDefinition, 'builtin:quote': processQuote, 'builtin:for-loop': processForLoop, @@ -180,22 +152,11 @@ export const BuiltInProcessorMapper = { 'builtin:replacement': processReplacementFunction, 'builtin:list': processList, 'builtin:vector': processVector, -// eslint-disable-next-line @typescript-eslint/no-explicit-any -} as const satisfies Record<`builtin:${string}`, BuiltInIdentifierProcessorWithConfig>; +} as const satisfies 
Record<`builtin:${string}`, BuiltInIdentifierProcessorWithConfig>; export type BuiltInMappingName = keyof typeof BuiltInProcessorMapper; export type ConfigOfBuiltInMappingName = Parameters[4]; -const BuiltInProcessorDecoratorMapper = { - 'dataframe:function-call': processDataFrameFunctionCall, - 'dataframe:access': processDataFrameAccess, - 'dataframe:assignment': processDataFrameAssignment, - 'dataframe:pipe': processDataFramePipe, - 'dataframe:expression-list': processDataFrameExpressionList -} as const satisfies Record<`${string}:${string}`, BuiltInIdentifierProcessorDecorator>; - -type BuiltInProcessorDecoratorName = keyof typeof BuiltInProcessorDecoratorMapper; - export const BuiltInMemory = new Map(); export const EmptyBuiltInMemory = new Map(); diff --git a/src/dataflow/environments/resolve-by-name.ts b/src/dataflow/environments/resolve-by-name.ts index f3bffa645c..23eac76cc1 100644 --- a/src/dataflow/environments/resolve-by-name.ts +++ b/src/dataflow/environments/resolve-by-name.ts @@ -18,7 +18,6 @@ import { envFingerprint } from '../../slicing/static/fingerprint'; import { EdgeType } from '../graph/edge'; import { EmptyArgument } from '../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; import type { RNumberValue } from '../../r-bridge/lang-4.x/convert-values'; -import type { NoInfo } from '../../r-bridge/lang-4.x/ast/model/model'; const FunctionTargetTypes = ReferenceType.Function | ReferenceType.BuiltInFunction | ReferenceType.Unknown | ReferenceType.Argument | ReferenceType.Parameter; @@ -309,11 +308,11 @@ export function resolveValueOfVariable(identifier: Identifier | undefined, envir } } -export interface ResolveInfo { +export interface ResolveInfo { /** The current environment used for name resolution */ environment?: REnvironmentInformation; /** The id map to resolve the node if given as an id */ - idMap?: AstIdMap; + idMap?: AstIdMap; /** The graph to resolve in */ graph?: DataflowGraph; /** Whether to track variables */ diff --git 
a/test/functionality/abstract-interpretation/data-frame/data-frame.ts b/test/functionality/abstract-interpretation/data-frame/data-frame.ts index 8273c93c7f..6c100415d2 100644 --- a/test/functionality/abstract-interpretation/data-frame/data-frame.ts +++ b/test/functionality/abstract-interpretation/data-frame/data-frame.ts @@ -1,23 +1,21 @@ import { assert, beforeAll, test } from 'vitest'; import type { DataFrameDomain } from '../../../../src/abstract-interpretation/data-frame/domain'; -import { leqColNames, leqInterval } from '../../../../src/abstract-interpretation/data-frame/domain'; -import type { AbstractInterpretationInfo } from '../../../../src/abstract-interpretation/data-frame/absint-info'; +import { DataFrameTop, leqColNames, leqInterval } from '../../../../src/abstract-interpretation/data-frame/domain'; import { PipelineExecutor } from '../../../../src/core/pipeline-executor'; import type { TREE_SITTER_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines'; import { createDataflowPipeline, DEFAULT_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines'; +import type { PipelineOutput } from '../../../../src/core/steps/pipeline/pipeline'; +import type { RSymbol } from '../../../../src/r-bridge/lang-4.x/ast/model/nodes/r-symbol'; +import type { ParentInformation } from '../../../../src/r-bridge/lang-4.x/ast/model/processing/decorate'; import { RType } from '../../../../src/r-bridge/lang-4.x/ast/model/type'; +import type { KnownParser } from '../../../../src/r-bridge/parser'; import { requestFromInput } from '../../../../src/r-bridge/retriever'; import type { RShell } from '../../../../src/r-bridge/shell'; import type { SingleSlicingCriterion, SlicingCriteria } from '../../../../src/slicing/criterion/parse'; import { slicingCriterionToId } from '../../../../src/slicing/criterion/parse'; import { assertUnreachable, guard, isNotUndefined } from '../../../../src/util/assert'; import { getRangeEnd } from 
'../../../../src/util/range'; -import type { RSymbol } from '../../../../src/r-bridge/lang-4.x/ast/model/nodes/r-symbol'; -import { resolveDataFrameValue } from '../../../../src/abstract-interpretation/data-frame/abstract-interpretation'; import { decorateLabelContext, type TestLabel } from '../../_helper/label'; -import type { ParentInformation } from '../../../../src/r-bridge/lang-4.x/ast/model/processing/decorate'; -import type { PipelineOutput } from '../../../../src/core/steps/pipeline/pipeline'; -import type { KnownParser } from '../../../../src/r-bridge/parser'; export enum DomainMatchingType { Exact = 'exact', @@ -42,7 +40,7 @@ export const DataFrameTestOverapproximation = { interface CriterionTestEntry { criterion: SingleSlicingCriterion, value: DataFrameDomain, - node: RSymbol, + node: RSymbol, lineNumber: number } @@ -160,7 +158,7 @@ function createCodeForOutput( function getInferredDomainForCriterion( result: PipelineOutput, criterion: SingleSlicingCriterion -): [DataFrameDomain, RSymbol] { +): [DataFrameDomain, RSymbol] { const idMap = result.dataflow.graph.idMap ?? 
result.normalize.idMap; const nodeId = slicingCriterionToId(criterion, idMap); const node = idMap.get(nodeId); @@ -168,7 +166,7 @@ function getInferredDomainForCriterion( if(node === undefined || node.type !== RType.Symbol) { throw new Error(`slicing criterion ${criterion} does not refer to a R symbol`); } - const value = resolveDataFrameValue(node, result.dataflow.environment); + const value = DataFrameTop; return [value, node]; } diff --git a/test/functionality/abstract-interpretation/data-frame/domain.test.ts b/test/functionality/abstract-interpretation/data-frame/domain.test.ts index 4e44c06820..569c3c903d 100644 --- a/test/functionality/abstract-interpretation/data-frame/domain.test.ts +++ b/test/functionality/abstract-interpretation/data-frame/domain.test.ts @@ -20,7 +20,7 @@ describe('Data Frame Domain', () => { }); }; check(ColNamesBottom, ColNamesBottom, true, ColNamesBottom, ColNamesBottom, ColNamesBottom); - check(ColNamesTop, ColNamesTop, true, ColNamesTop, ColNamesTop, ColNamesBottom); + check(ColNamesTop, ColNamesTop, true, ColNamesTop, ColNamesTop, ColNamesTop); check(ColNamesBottom, ColNamesTop, true, ColNamesTop, ColNamesBottom, ColNamesBottom); check(ColNamesTop, ColNamesBottom, false, ColNamesTop, ColNamesBottom, ColNamesTop); check(ColNamesBottom, ['id', 'age'], true, ['id', 'age'], ColNamesBottom, ColNamesBottom); @@ -30,7 +30,7 @@ describe('Data Frame Domain', () => { check(['id', 'age', 'score'], ['id', 'age'], false, ['id', 'age', 'score'], ['id', 'age'], ['score']); check(['id', 'age', 'score'], ['id', 'category'], false, ['id', 'age', 'score', 'category'], ['id'], ['age', 'score']); check(['id', 'category'], ['id', 'age', 'score'], false, ['id', 'age', 'score', 'category'], ['id'], ['category']); - check(['id', 'age'], ColNamesTop, true, ColNamesTop, ['id', 'age'], ColNamesBottom); + check(['id', 'age'], ColNamesTop, true, ColNamesTop, ['id', 'age'], ['id', 'age']); check(ColNamesTop, ['id', 'age'], false, ColNamesTop, ['id', 'age'], 
ColNamesTop); }); diff --git a/test/functionality/abstract-interpretation/data-frame/inference.test.ts b/test/functionality/abstract-interpretation/data-frame/inference.test.ts index b3046efd91..4c0dca0330 100644 --- a/test/functionality/abstract-interpretation/data-frame/inference.test.ts +++ b/test/functionality/abstract-interpretation/data-frame/inference.test.ts @@ -3,7 +3,7 @@ import { withShell } from '../../_helper/shell'; import { ColNamesTop, DataFrameTop } from '../../../../src/abstract-interpretation/data-frame/domain'; import { testDataFrameDomainAgainstReal, assertDataFrameDomain, DomainMatchingType, DataFrameTestOverapproximation } from './data-frame'; -describe.sequential('Data Frame Abstract Interpretation', withShell(shell => { +describe.sequential('Data Frame Abstract Interpretation', { skip: true }, withShell(shell => { assertDataFrameDomain( shell, 'df <- data.frame(id = 1:5, age = c(25, 32, 35, 40, 45), score = c(90, 85, 88, 92, 95), row.names = NULL)', diff --git a/test/functionality/abstract-interpretation/simple-cfg.test.ts b/test/functionality/abstract-interpretation/simple-cfg.test.ts new file mode 100644 index 0000000000..c739b86e1e --- /dev/null +++ b/test/functionality/abstract-interpretation/simple-cfg.test.ts @@ -0,0 +1,172 @@ +import { assert, describe, test } from 'vitest'; +import { PipelineExecutor } from '../../../src/core/pipeline-executor'; +import { DEFAULT_DATAFLOW_PIPELINE } from '../../../src/core/steps/pipeline/default-pipelines'; +import { RType } from '../../../src/r-bridge/lang-4.x/ast/model/type'; +import { RFalse, RTrue } from '../../../src/r-bridge/lang-4.x/convert-values'; +import { requestFromInput } from '../../../src/r-bridge/retriever'; +import { CfgVertexType, emptyControlFlowInformation } from '../../../src/util/cfg/cfg'; +import { cfgToMermaidUrl } from '../../../src/util/mermaid/cfg'; +import { withShell } from '../_helper/shell'; +import type { SimpleControlFlowInformation } from 
'../../../src/abstract-interpretation/simple-cfg'; +import { extractSimpleCFG, SimpleCfgVertexType, SimpleControlFlowGraph } from '../../../src/abstract-interpretation/simple-cfg'; + +describe.sequential('Simple Control Flow Graph', withShell(shell => { + function assertCfg(code: string, partialExpected: Partial) { + const expected: SimpleControlFlowInformation = { ...emptyControlFlowInformation(), graph: new SimpleControlFlowGraph(), ...partialExpected }; + + return test(code, async()=> { + const result = await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, { + parser: shell, + request: requestFromInput(code) + }).allRemainingSteps(); + const cfg = extractSimpleCFG(result.normalize); + + try { + assert.deepStrictEqual(cfg.entryPoints, expected.entryPoints, 'entry points differ'); + assert.deepStrictEqual(cfg.exitPoints, expected.exitPoints, 'exit points differ'); + assert.deepStrictEqual(cfg.breaks, expected.breaks, 'breaks differ'); + assert.deepStrictEqual(cfg.nexts, expected.nexts, 'nexts differ'); + assert.deepStrictEqual(cfg.returns, expected.returns, 'returns differ'); + assert.deepStrictEqual(cfg.graph.vertices(), expected.graph.vertices(), 'vertices differ'); + assert.deepStrictEqual(cfg.graph.edges(), expected.graph.edges(), 'edges differ'); + } /* v8 ignore next 4 */ catch(e: unknown) { + console.error(`expected: ${cfgToMermaidUrl(expected, result.normalize)}`); + console.error(`actual: ${cfgToMermaidUrl(cfg, result.normalize)}`); + throw e; + } + }); + } + + assertCfg('x <- 42\nx <- x + 4\nprint(x)', { + entryPoints: [2], + exitPoints: [11], + graph: new SimpleControlFlowGraph() + .addVertex({ id: 2, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 7, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 11, name: RType.FunctionCall, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addEdge(2, 7, { label: 'FD' 
}) + .addEdge(7, 11, { label: 'FD' }) + }); + + assertCfg('if(TRUE) 1', { + entryPoints: [3], + exitPoints: [1, 3], + graph: new SimpleControlFlowGraph() + .addVertex({ id: 3, name: RType.IfThenElse, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.IfThenElse }) + .addVertex({ id: 1, name: RType.Number, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addEdge(3, 1, { label: 'CD', when: RTrue, caused: 3 }) + }); + + assertCfg('df <- data.frame(id = 1:5)\nif(nrow(df) > 5) {\ndf$name <- "A"\n} else {\ndf$name <- "B"\n}\nprint(df)', { + entryPoints: [8], + exitPoints: [37], + graph: new SimpleControlFlowGraph() + .addVertex({ id: 8, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 33, name: RType.IfThenElse, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.IfThenElse }) + .addVertex({ id: 22, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 31, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 37, name: RType.FunctionCall, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addEdge(8, 33, { label: 'FD' }) + .addEdge(33, 22, { label: 'CD', when: RTrue, caused: 33 }) + .addEdge(33, 31, { label: 'CD', when: RFalse, caused: 33 }) + .addEdge(22, 37, { label: 'FD' }) + .addEdge(31, 37, { label: 'FD' }) + }); + + assertCfg('if (TRUE) {} else {}\nprint("Hello World!")', { + entryPoints: [7], + exitPoints: [11], + graph: new SimpleControlFlowGraph() + .addVertex({ id: 7, name: RType.IfThenElse, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.IfThenElse }) + .addVertex({ id: 11, name: RType.FunctionCall, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addEdge(7, 11, { label: 'FD' }) + }); + + assertCfg('if (TRUE) {} else {\nprint("Unreachable :)")\n}\nprint("Hello World!")', { + entryPoints: [11], + 
exitPoints: [15], + graph: new SimpleControlFlowGraph() + .addVertex({ id: 11, name: RType.IfThenElse, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.IfThenElse }) + .addVertex({ id: 9, name: RType.FunctionCall, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 15, name: RType.FunctionCall, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addEdge(11, 15, { label: 'CD', when: RTrue, caused: 11 }) + .addEdge(11, 9, { label: 'CD', when: RFalse, caused: 11 }) + .addEdge(9, 15, { label: 'FD' }) + }); + + assertCfg('repeat {\nbreak\n}\nprint("Hello World!")', { + entryPoints: [4], + exitPoints: [8], + graph: new SimpleControlFlowGraph() + .addVertex({ id: 4, name: RType.RepeatLoop, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.RepeatLoop }) + .addVertex({ id: 2, name: RType.Break, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.Break }) + .addVertex({ id: 8, name: RType.FunctionCall, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addEdge(4, 2, { label: 'FD' }) + .addEdge(2, 8, { label: 'FD' }) + }); + + assertCfg('df <- data.frame(id = 1:5)\nfor (x in 1:10) {\ndf <- rbind(df, x)\n}\nprint(df)', { + entryPoints: [8], + exitPoints: [28], + graph: new SimpleControlFlowGraph() + .addVertex({ id: 8, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 24, name: RType.ForLoop, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.ForLoop }) + .addVertex({ id: 22, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 28, name: RType.FunctionCall, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addEdge(8, 24, { label: 'FD' }) + .addEdge(24, 22, { label: 'CD', when: RTrue, caused: 24 }) + .addEdge(24, 28, { label: 'CD', when: RFalse, caused: 24 }) + .addEdge(22, 24, { label: 'FD' }) + }); + + assertCfg('x <- 42\nfor (i 
in 1:n) {\nif (i == 5) {\nbreak\n}\n}\nprint(x)', { + entryPoints: [2], + exitPoints: [22], + graph: new SimpleControlFlowGraph() + .addVertex({ id: 2, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 18, name: RType.ForLoop, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.ForLoop }) + .addVertex({ id: 16, name: RType.IfThenElse, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.IfThenElse }) + .addVertex({ id: 14, name: RType.Break, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.Break }) + .addVertex({ id: 22, name: RType.FunctionCall, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addEdge(2, 18, { label: 'FD' }) + .addEdge(18, 16, { label: 'CD', when: RTrue, caused: 18 }) + .addEdge(18, 22, { label: 'CD', when: RFalse, caused: 18 }) + .addEdge(16, 14, { label: 'CD', when: RTrue, caused: 16 }) + .addEdge(16, 18, { label: 'CD', when: RFalse, caused: 16 }) + .addEdge(14, 22, { label: 'FD' }) + }); + + assertCfg('x <- 42\nwhile(TRUE) {}\nprint("Unreachable :D!")', { + entryPoints: [2], + exitPoints: [11], + graph: new SimpleControlFlowGraph() + .addVertex({ id: 2, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 7, name: RType.WhileLoop, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.WhileLoop }) + .addVertex({ id: 11, name: RType.FunctionCall, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addEdge(2, 7, { label: 'FD' }) + .addEdge(7, 11, { label: 'CD', when: RFalse, caused: 7 }) + }); + + assertCfg('x <- runif(50)\nif (n > 0) {\nif (n == 42) {\nx <- 44\n} else {\nx <- 5\n}\nprint(x)\n} else {\nx <- n\n}\nprint(x)', { + entryPoints: [5], + exitPoints: [42], + graph: new SimpleControlFlowGraph() + .addVertex({ id: 5, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 38, name: RType.IfThenElse, type: 
CfgVertexType.Statement, tag: SimpleCfgVertexType.IfThenElse }) + .addVertex({ id: 26, name: RType.IfThenElse, type: CfgVertexType.Statement, tag: SimpleCfgVertexType.IfThenElse }) + .addVertex({ id: 18, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 24, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 30, name: RType.FunctionCall, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 36, name: RType.BinaryOp, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addVertex({ id: 42, name: RType.FunctionCall, type: CfgVertexType.Expression, tag: SimpleCfgVertexType.Expression }) + .addEdge(5, 38, { label: 'FD' }) + .addEdge(38, 26, { label: 'CD', when: RTrue, caused: 38 }) + .addEdge(38, 36, { label: 'CD', when: RFalse, caused: 38 }) + .addEdge(26, 18, { label: 'CD', when: RTrue, caused: 26 }) + .addEdge(26, 24, { label: 'CD', when: RFalse, caused: 26 }) + .addEdge(18, 30, { label: 'FD' }) + .addEdge(24, 30, { label: 'FD' }) + .addEdge(30, 42, { label: 'FD' }) + .addEdge(36, 42, { label: 'FD' }) + }); +})); From 01db33ca12e6d7004c5d3cf24edef0d47e7d1a61 Mon Sep 17 00:00:00 2001 From: Oliver Date: Sun, 20 Apr 2025 16:03:45 +0200 Subject: [PATCH 10/11] Adapt the data frame processors and semantics to the new CFG (#1531) * feat: basic fold over CFG for forward traversal * feat: add simplified forward-connected control flow graph * feat-fix: correct label for CFG exit point edges * test: unit tests for simple control flow graph * feat: add CFG visitor for fixpoint iteration for data frames * feat-fix: add missing negation in has-changed check * feat: setup for data frame processors and semantics * feat: directly evaluate arguments when processing function * feat: add semantics mapper for column/row access and assignment * feat: implement data frame processors for basic nodes * feat: return 
resulting data frame domain * feat: support different types of inferred constraints * test: add tests for data frame state domain * feat: store abstract state in nodes * feat: support control flow constructs in abstract interpretation * test: add tests for control flow support * feat: support colnames assignment, cbind and rbind --- .../data-frame/absint-info.ts | 38 +++ .../data-frame/abstract-interpretation.ts | 163 ++++++++-- .../data-frame/domain.ts | 86 +++-- .../data-frame/mappers/access-mapper.ts | 119 +++++++ .../data-frame/mappers/assignment-mapper.ts | 188 +++++++++++ .../data-frame/mappers/function-mapper.ts | 233 ++++++++++++++ .../data-frame/processor.ts | 196 ++++++++++-- .../data-frame/resolve-args.ts | 103 ++++-- .../data-frame/semantics-mapper.ts | 38 +++ .../data-frame/semantics.ts | 223 +++++++++++++ src/cli/repl/commands/repl-cfg.ts | 16 +- src/dataflow/environments/resolve-by-name.ts | 5 +- src/util/cfg/cfg.ts | 57 +++- src/util/r-value.ts | 74 +++++ .../data-frame/data-frame.ts | 38 ++- .../data-frame/domain.test.ts | 187 ++++++++--- .../data-frame/inference.test.ts | 294 +++++++++++++----- 17 files changed, 1789 insertions(+), 269 deletions(-) create mode 100644 src/abstract-interpretation/data-frame/absint-info.ts create mode 100644 src/abstract-interpretation/data-frame/mappers/access-mapper.ts create mode 100644 src/abstract-interpretation/data-frame/mappers/assignment-mapper.ts create mode 100644 src/abstract-interpretation/data-frame/mappers/function-mapper.ts create mode 100644 src/abstract-interpretation/data-frame/semantics-mapper.ts create mode 100644 src/abstract-interpretation/data-frame/semantics.ts create mode 100644 src/util/r-value.ts diff --git a/src/abstract-interpretation/data-frame/absint-info.ts b/src/abstract-interpretation/data-frame/absint-info.ts new file mode 100644 index 0000000000..41f1173e4a --- /dev/null +++ b/src/abstract-interpretation/data-frame/absint-info.ts @@ -0,0 +1,38 @@ +import type { NodeId } from 
'../../r-bridge/lang-4.x/ast/model/processing/node-id'; +import type { DataFrameStateDomain } from './domain'; +import type { DataFrameOperationArgs, DataFrameOperationName } from './semantics'; + +export interface DataFrameOperation { + operation: Name, + operand: NodeId | undefined, + args: DataFrameOperationArgs +} + +type DataFrameOperations = { + [Name in DataFrameOperationName]: DataFrameOperation; +}[DataFrameOperationName]; + +interface DataFrameInfoBase { + domain?: DataFrameStateDomain +} + +export interface DataFrameAssignmentInfo { + type: 'assignment', + identifier: NodeId, + expression: NodeId +} + +export interface DataFrameExpressionInfo { + type: 'expression', + operations: DataFrameOperations[] +} + +export interface DataFrameOtherInfo { + type: 'other' +} + +export type DataFrameInfo = DataFrameAssignmentInfo | DataFrameExpressionInfo | DataFrameOtherInfo; + +export interface AbstractInterpretationInfo { + dataFrame?: DataFrameInfo & DataFrameInfoBase +} diff --git a/src/abstract-interpretation/data-frame/abstract-interpretation.ts b/src/abstract-interpretation/data-frame/abstract-interpretation.ts index b2e1ba723f..2c0e71100d 100644 --- a/src/abstract-interpretation/data-frame/abstract-interpretation.ts +++ b/src/abstract-interpretation/data-frame/abstract-interpretation.ts @@ -1,49 +1,144 @@ import type { DataflowGraph } from '../../dataflow/graph/graph'; +import type { RConstant, RNode, RSingleNode } from '../../r-bridge/lang-4.x/ast/model/model'; +import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; -import { RFalse, RTrue } from '../../r-bridge/lang-4.x/convert-values'; -import { guard } from '../../util/assert'; -import type { CfgEdge } from '../../util/cfg/cfg'; -import type { SimpleControlFlowGraph, SimpleControlFlowInformation } from '../simple-cfg'; +import { RType } from 
'../../r-bridge/lang-4.x/ast/model/type'; +import { CfgVertexType, ControlFlowGraph, type CfgVertex, type ControlFlowInformation } from '../../util/cfg/cfg'; +import type { AbstractInterpretationInfo } from './absint-info'; import type { DataFrameDomain, DataFrameStateDomain } from './domain'; -import { equalDataFrameState } from './domain'; -import { processDataFrameNode } from './processor'; +import { equalDataFrameState, joinDataFrameStates } from './domain'; +import { processDataFrameExpression, processDataFrameLeaf } from './processor'; -export function performDataFrameAbsint(cfg: SimpleControlFlowInformation, dfg: DataflowGraph) { - const visited: Set = new Set(); +export function performDataFrameAbsint(cfinfo: ControlFlowInformation, dfg: DataflowGraph): DataFrameStateDomain { + const visited: Map = new Map(); + let finalDomain: DataFrameStateDomain = new Map(); - const visitor = (cfg: SimpleControlFlowGraph, nodeId: NodeId, domain: DataFrameStateDomain) => { - const node = dfg.idMap?.get(nodeId); - guard(node !== undefined, 'Node must not be undefined'); + const visitor = (cfg: ControlFlowGraph, vertex: CfgVertex): CfgVertex[] => { + if(shouldSkipVertex(vertex, dfg)) { + return getSuccessorVertices(cfg, vertex.id, dfg); + } + const predecessors = getPredecessorNodes(cfg, vertex.id, dfg); + const inputDomain = joinDataFrameStates(...predecessors.map(node => node.info.dataFrame?.domain ?? new Map())); + let oldDomain = new Map(); + let newDomain = inputDomain; - const result = processDataFrameNode(node, domain, dfg); - const equal = 'FD' in result ? { - 'FD': equalDataFrameState(domain, result['FD']), - [RTrue]: equalDataFrameState(domain, result[RTrue]), - [RFalse]: equalDataFrameState(domain, result[RFalse]) - } : equalDataFrameState(domain, result); + const entryNode: RNode | undefined = dfg.idMap?.get(vertex.id); - const hasChanged = (edge: CfgEdge) => typeof equal === 'object' ? !equal[edge.label === 'FD' ? 
edge.label : edge.when] : !equal; - const getDomain = (edge: CfgEdge) => 'FD' in result ? result[edge.label === 'FD' ? edge.label : edge.when] : result; + if(entryNode !== undefined && isRSingleNode(entryNode)) { + oldDomain = entryNode.info.dataFrame?.domain ?? oldDomain; + newDomain = processDataFrameLeaf(entryNode, new Map(inputDomain), dfg); + } + if(vertex.type === CfgVertexType.EndMarker) { + const exitId = getNodeIdForExitVertex(vertex.id); + const exitNode: RNode | undefined = exitId !== undefined ? dfg.idMap?.get(exitId) : undefined; - const successors = cfg.edges().get(nodeId)?.entries() - .filter(([successor, edge]) => !visited.has(successor) || hasChanged(edge)) - .map<[NodeId, DataFrameStateDomain]>(([successor, edge]) => [successor, getDomain(edge)]) - .toArray(); + if(exitNode !== undefined && !isRSingleNode(exitNode)) { + oldDomain = exitNode.info.dataFrame?.domain ?? oldDomain; + newDomain = processDataFrameExpression(exitNode, new Map(inputDomain), dfg); + } + } + if(cfinfo.exitPoints.includes(vertex.id)) { + finalDomain = newDomain; + } + visited.set(vertex.id, (visited.get(vertex.id) ?? 0) + 1); - successors?.forEach(([successor]) => visited.add(successor)); - - return successors ?? 
[]; + return getSuccessorVertices(cfg, vertex.id, dfg) + .filter(successor => !visited.has(successor.id) || !equalDataFrameState(newDomain, oldDomain)); }; - foldGraph(cfg.graph, cfg.entryPoints.map((entry) => [entry, new Map()]), visitor); + const cfg = flipCfg(cfinfo.graph); + const entryPoints = cfinfo.entryPoints + .map(id => cfg.vertices().get(id)) + .filter(vertex => vertex !== undefined); + + foldCfg(cfg, entryPoints, visitor); + return finalDomain; +} + +export function flipCfg(cfg: ControlFlowGraph): ControlFlowGraph { + const flippedCfg = new ControlFlowGraph(); + + for(const [id, vertex] of cfg.vertices()) { + flippedCfg.addVertex(vertex, cfg.rootVertexIds().has(id)); + } + for(const [to, edges] of cfg.edges()) { + for(const [from, edge] of edges) { + flippedCfg.addEdge(from, to, edge); + } + } + return flippedCfg; } -function foldGraph( - cfg: SimpleControlFlowGraph, - nodes: [NodeId, DataFrameStateDomain][], - visitor: (cfg: SimpleControlFlowGraph, node: NodeId, domain: DataFrameStateDomain) => [NodeId, DataFrameStateDomain][] +function foldCfg( + cfg: ControlFlowGraph, + vertices: CfgVertex[], + visitor: (cfg: ControlFlowGraph, vertex: CfgVertex) => CfgVertex[] ): void { - for(const [node, domain] of nodes) { - const successors = visitor(cfg, node, domain); - foldGraph(cfg, successors, visitor); + for(const vertex of vertices) { + const successors = visitor(cfg, vertex); + foldCfg(cfg, successors, visitor); } } + +function isRConstant( + node: RNode +): node is RConstant { + return node.type === RType.String || node.type === RType.Number || node.type === RType.Logical; +} + +function isRSingleNode( + node: RNode +): node is RSingleNode { + return isRConstant(node) || node.type === RType.Symbol || node.type === RType.Break || node.type === RType.Next || node.type === RType.Comment || node.type === RType.LineDirective; +} + +// We only process vertices of leaf nodes and exit vertices (no entry nodes of complex nodes) +function shouldSkipVertex(vertex: 
CfgVertex, dfg: DataflowGraph) { + if(vertex.type === CfgVertexType.EndMarker) { + return false; + } else if(vertex.type === CfgVertexType.MidMarker) { + return true; + } + const node = dfg.idMap?.get(vertex.id); + + return node === undefined || !isRSingleNode(node); +} + +function getNodeIdForExitVertex(vertexId: NodeId): number | undefined { + if(typeof vertexId === 'number') { + return vertexId; + } + const nodeId = Number(vertexId.match(/^(\d+)/)?.[1]); + + return nodeId !== undefined && !isNaN(nodeId) ? nodeId : undefined; +} + +function getPredecessorNodes(cfg: ControlFlowGraph, vertexId: NodeId, dfg: DataflowGraph): RNode[] { + return cfg.ingoing(vertexId)?.keys() + .map(id => cfg.vertices().get(id)) + .flatMap(vertex => { + if(vertex !== undefined && shouldSkipVertex(vertex, dfg)) { + return getPredecessorNodes(cfg, vertex.id, dfg); + } else if(vertex?.type === CfgVertexType.EndMarker) { + const nodeId = getNodeIdForExitVertex(vertex.id); + return nodeId ? [dfg.idMap?.get(nodeId)] : []; + } else { + return vertex ? [dfg.idMap?.get(vertex.id)] : []; + } + }) + .filter(node => node !== undefined) + .toArray() ?? []; +} + +function getSuccessorVertices(cfg: ControlFlowGraph, vertexId: NodeId, dfg: DataflowGraph): CfgVertex[] { + return cfg.outgoing(vertexId)?.keys() + .map(id => cfg.vertices().get(id)) + .flatMap(vertex => { + if(vertex !== undefined && shouldSkipVertex(vertex, dfg)) { + return getSuccessorVertices(cfg, vertex.id, dfg); + } else { + return [vertex]; + } + }) + .filter(vertex => vertex !== undefined) + .toArray() ?? 
[]; +} diff --git a/src/abstract-interpretation/data-frame/domain.ts b/src/abstract-interpretation/data-frame/domain.ts index 87a6b81703..eaf24f3ac7 100644 --- a/src/abstract-interpretation/data-frame/domain.ts +++ b/src/abstract-interpretation/data-frame/domain.ts @@ -99,37 +99,51 @@ export function meetInterval(X1: IntervalDomain, X2: IntervalDomain): IntervalDo } } -export function joinDataFrames(...values: DataFrameDomain[]) { - let value = values[0] ?? DataFrameTop; - - for(let i = 1; i < values.length; i++) { - value = { - colnames: joinColNames(value.colnames, values[i].colnames), - cols: joinInterval(value.cols, values[i].cols), - rows: joinInterval(value.rows, values[i].rows) - }; +export function addInterval(X1: IntervalDomain, X2: IntervalDomain): IntervalDomain { + if(X1 === IntervalBottom || X2 === IntervalBottom) { + return IntervalBottom; + } else { + return [X1[0] + X2[0], X1[1] + X2[1]]; } - return value; } -export function meetDataFrames(...values: DataFrameDomain[]) { - let value = values[0] ?? 
DataFrameTop; +export function subtractInterval(X1: IntervalDomain, X2: IntervalDomain): IntervalDomain { + if(X1 === IntervalBottom || X2 === IntervalBottom) { + return IntervalBottom; + } else { + return [X1[0] - X2[0], X1[1] - X2[1]]; + } +} - for(let i = 1; i < values.length; i++) { - value = { - colnames: meetColNames(value.colnames, values[i].colnames), - cols: meetInterval(value.cols, values[i].cols), - rows: meetInterval(value.rows, values[i].rows) - }; +export function includeZeroInterval(X: IntervalDomain): IntervalDomain { + if(X === IntervalBottom) { + return IntervalBottom; + } else { + return [0, X[1]]; } - return value; } -export function equalDataFrameDomain(X1: DataFrameDomain, X2: DataFrameDomain) { +export function equalDataFrameDomain(X1: DataFrameDomain, X2: DataFrameDomain): boolean { return equalColNames(X1.colnames, X2.colnames) && equalInterval(X1.cols, X2.cols) && equalInterval(X1.rows, X2.rows); } -export function equalDataFrameState(R1: DataFrameStateDomain, R2: DataFrameStateDomain) { +export function joinDataFrames(...values: DataFrameDomain[]): DataFrameDomain { + return values.slice(1).reduce((a, b) => ({ + colnames: joinColNames(a.colnames, b.colnames), + cols: joinInterval(a.cols, b.cols), + rows: joinInterval(a.rows, b.rows) + }), values[0] ?? DataFrameTop); +} + +export function meetDataFrames(...values: DataFrameDomain[]): DataFrameDomain { + return values.slice(1).reduce((a, b) => ({ + colnames: meetColNames(a.colnames, b.colnames), + cols: meetInterval(a.cols, b.cols), + rows: meetInterval(a.rows, b.rows) + }), values[0] ?? 
DataFrameTop); +} + +export function equalDataFrameState(R1: DataFrameStateDomain, R2: DataFrameStateDomain): boolean { if(R1.size !== R2.size) { return false; } @@ -141,3 +155,33 @@ export function equalDataFrameState(R1: DataFrameStateDomain, R2: DataFrameState } return true; } + +export function joinDataFrameStates(...values: DataFrameStateDomain[]): DataFrameStateDomain { + const result = new Map(values[0]); + + for(const domain of values.slice(1)) { + for(const [nodeId, value] of domain) { + if(result.has(nodeId)) { + result.set(nodeId, joinDataFrames(result.get(nodeId) ?? DataFrameTop, value)); + } else { + result.set(nodeId, value); + } + } + } + return result; +} + +export function meetDataFrameStates(...values: DataFrameStateDomain[]): DataFrameStateDomain { + const result = new Map(values[0]); + + for(const domain of values.slice(1)) { + for(const [nodeId, value] of domain) { + if(result.has(nodeId)) { + result.set(nodeId, meetDataFrames(result.get(nodeId) ?? DataFrameTop, value)); + } else { + result.set(nodeId, value); + } + } + } + return result; +} diff --git a/src/abstract-interpretation/data-frame/mappers/access-mapper.ts b/src/abstract-interpretation/data-frame/mappers/access-mapper.ts new file mode 100644 index 0000000000..9a10a83365 --- /dev/null +++ b/src/abstract-interpretation/data-frame/mappers/access-mapper.ts @@ -0,0 +1,119 @@ +import type { ResolveInfo } from '../../../dataflow/environments/resolve-by-name'; +import type { DataflowGraph } from '../../../dataflow/graph/graph'; +import type { RNode } from '../../../r-bridge/lang-4.x/ast/model/model'; +import type { RIndexAccess, RNamedAccess } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-access'; +import type { RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; 
+import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; +import type { DataFrameInfo } from '../absint-info'; +import { resolveIdToArgValue, resolveIdToArgValueSymbolName } from '../resolve-args'; +import { isStringBasedAccess } from '../semantics-mapper'; + +const SpecialAccessArgumentsMapper: Partial> = { + '[': ['drop'], + '[[': ['exact'] +}; + +export function mapDataFrameAccess( + node: RNode, + dfg: DataflowGraph +): DataFrameInfo | undefined { + if(node.type === RType.Access) { + if(isStringBasedAccess(node)) { + return mapDataFrameNamedColumnAccess(node, { graph: dfg, idMap: dfg.idMap, full: true }); + } else { + return mapDataFrameIndexColRowAccess(node, { graph: dfg, idMap: dfg.idMap, full: true }); + } + } +} + +function mapDataFrameNamedColumnAccess( + access: RNamedAccess, + info: ResolveInfo +): DataFrameInfo { + const argName = resolveIdToArgValueSymbolName(access.access[0], info); + + return { + type: 'expression', + operations: [{ + operation: 'accessCol', + operand: access.accessed.info.id, + args: { columns: argName ? [argName] : undefined } + }] + }; +} + +function mapDataFrameIndexColRowAccess( + access: RIndexAccess, + info: ResolveInfo +): DataFrameInfo { + const args = getEffectiveArgs(access.operator, access.access); + + if(args.every(arg => arg === EmptyArgument)) { + return { + type: 'expression', + operations: [{ + operation: 'identity', + operand: access.accessed.info.id, + args: {} + }] + }; + } else if(args.length > 0 && args.length <= 2) { + const rowArg = args.length < 2 ? undefined : args[0]; + const colArg = args.length < 2 ? 
args[0] : args[1]; + + const result: DataFrameInfo = { type: 'expression', operations: [] }; + + if(rowArg !== undefined && rowArg !== EmptyArgument) { + const rowValue: unknown = resolveIdToArgValue(rowArg, info); + let rows: number[] | undefined = undefined; + + if(typeof rowValue === 'number') { + rows = [rowValue]; + } else if(Array.isArray(rowValue) && rowValue.every(row => typeof row === 'number')) { + rows = rowValue; + } + result.operations.push({ + operation: 'accessRow', + operand: access.accessed.info.id, + args: { rows } + }); + } + if(colArg !== undefined && colArg !== EmptyArgument) { + const colValue: unknown = resolveIdToArgValue(colArg, info); + let columns: string[] | number[] | undefined = undefined; + + if(typeof colValue === 'string') { + columns = [colValue]; + } else if(typeof colValue === 'number') { + columns = [colValue]; + } else if(Array.isArray(colValue) && (colValue.every(col => typeof col === 'string') || colValue.every(col => typeof col === 'number'))) { + columns = colValue; + } + result.operations.push({ + operation: 'accessCol', + operand: access.accessed.info.id, + args: { columns } + }); + } + return result; + } + return { + type: 'expression', + operations: [{ + operation: 'unknown', + operand: access.accessed.info.id, + args: { modifyInplace: true } + }] + }; +} + +function getEffectiveArgs( + funct: keyof typeof SpecialAccessArgumentsMapper, + args: readonly RFunctionArgument[] +): readonly RFunctionArgument[] { + const ignoredArgs = SpecialAccessArgumentsMapper[funct] ?? 
[]; + + return args.filter(arg => arg === EmptyArgument || arg.name === undefined || !ignoredArgs.includes(arg.name.content)); +} diff --git a/src/abstract-interpretation/data-frame/mappers/assignment-mapper.ts b/src/abstract-interpretation/data-frame/mappers/assignment-mapper.ts new file mode 100644 index 0000000000..95d5a6a337 --- /dev/null +++ b/src/abstract-interpretation/data-frame/mappers/assignment-mapper.ts @@ -0,0 +1,188 @@ +import type { ResolveInfo } from '../../../dataflow/environments/resolve-by-name'; +import type { DataflowGraph } from '../../../dataflow/graph/graph'; +import { toUnnamedArgument } from '../../../dataflow/internal/process/functions/call/argument/make-argument'; +import type { RNode } from '../../../r-bridge/lang-4.x/ast/model/model'; +import type { RIndexAccess, RNamedAccess } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-access'; +import type { RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { RString } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-string'; +import type { RSymbol } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; +import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; +import type { DataFrameInfo } from '../absint-info'; +import { resolveIdToArgStringVector, resolveIdToArgValue, resolveIdToArgValueSymbolName } from '../resolve-args'; +import { isStringBasedAccess } from '../semantics-mapper'; + +const DataFrameAssignmentFunctionMapper = { + 'colnames': mapDataFrameColNamesAssignment, + 'names': mapDataFrameColNamesAssignment, + 'rownames': mapDataFrameRowNamesAssignment, + 'dimnames': mapDataFrameDimNamesAssignment +} as const satisfies Record; + +type DataFrameAssignmentFunctionMapping = ( + operand: RFunctionArgument, + expression: RNode, + 
info: ResolveInfo +) => DataFrameInfo | undefined; + +type DataFrameAssignmentFunction = keyof typeof DataFrameAssignmentFunctionMapper; + +export function mapDataFrameAssignment( + node: RNode, + dfg: DataflowGraph +): DataFrameInfo | undefined { + if(node.type === RType.BinaryOp && node.lhs !== undefined && node.rhs !== undefined) { + if(node.lhs.type === RType.Symbol || node.lhs.type === RType.String) { + return mapDataFrameVariableAssignment(node.lhs, node.rhs); + } else if(node.lhs.type === RType.Access) { + if(isStringBasedAccess(node.lhs)) { + return mapDataFrameNamedColumnAssignment(node.lhs, node.rhs, { graph: dfg, idMap: dfg.idMap, full: true }); + } else { + return mapDataFrameIndexColRowAssignment(node.lhs, node.rhs, { graph: dfg, idMap: dfg.idMap, full: true }); + } + } else if(node.lhs.type === RType.FunctionCall && node.lhs.named) { + if(node.lhs.functionName.content in DataFrameAssignmentFunctionMapper && node.lhs.arguments.length > 0) { + const functionName = node.lhs.functionName.content as DataFrameAssignmentFunction; + const functionProcessor = DataFrameAssignmentFunctionMapper[functionName]; + + return functionProcessor(node.lhs.arguments[0], node.rhs, { graph: dfg, idMap: dfg.idMap, full: true }); + } + } + } +} + +function mapDataFrameVariableAssignment( + identifier: RSymbol | RString, + expression: RNode +): DataFrameInfo { + return { + type: 'assignment', + identifier: identifier.info.id, + expression: expression.info.id + }; +} + +function mapDataFrameNamedColumnAssignment( + access: RNamedAccess, + expression: RNode, + info: ResolveInfo +): DataFrameInfo { + const argName = resolveIdToArgValueSymbolName(access.access[0], info); + + return { + type: 'expression', + operations: [{ + operation: 'assignCol', + operand: access.accessed.info.id, + args: { columns: argName ? 
[argName] : undefined } + }] + }; +} + +function mapDataFrameIndexColRowAssignment( + access: RIndexAccess, + expression: RNode, + info: ResolveInfo +): DataFrameInfo { + const args = access.access; + + if(args.length === 0 || args.every(arg => arg === EmptyArgument)) { + return { + type: 'expression', + operations: [{ + operation: 'identity', + operand: access.accessed.info.id, + args: {} + }] + }; + } + const rowArg = args.length < 2 ? undefined : args[0]; + const colArg = args.length < 2 ? args[0] : args[1]; + + const result: DataFrameInfo = { type: 'expression', operations: [] }; + + if(rowArg !== undefined && rowArg !== EmptyArgument) { + const rowValue: unknown = resolveIdToArgValue(rowArg, info); + let rows: number[] | undefined = undefined; + + if(typeof rowValue === 'number') { + rows = [rowValue]; + } else if(Array.isArray(rowValue) && rowValue.every(row => typeof row === 'number')) { + rows = rowValue; + } + result.operations.push({ + operation: 'assignRow', + operand: access.accessed.info.id, + args: { rows } + }); + } + if(colArg !== undefined && colArg !== EmptyArgument) { + const colValue: unknown = resolveIdToArgValue(colArg, info); + let columns: string[] | number[] | undefined = undefined; + + if(typeof colValue === 'string') { + columns = [colValue]; + } else if(typeof colValue === 'number') { + columns = [colValue]; + } else if(Array.isArray(colValue) && (colValue.every(col => typeof col === 'string') || colValue.every(col => typeof col === 'number'))) { + columns = colValue; + } + result.operations.push({ + operation: 'assignCol', + operand: access.accessed.info.id, + args: { columns } + }); + } + return result; +} + +function mapDataFrameColNamesAssignment( + operand: RFunctionArgument, + expression: RNode, + info: ResolveInfo +): DataFrameInfo | undefined { + if(operand !== EmptyArgument && operand?.value !== undefined && info.idMap) { + const argument = toUnnamedArgument(expression, info.idMap); + const assignedNames = 
resolveIdToArgStringVector(argument, info); + + return { + type: 'expression', + operations: [{ + operation: 'setColNames', + operand: operand.value.info.id, + args: { colnames: assignedNames } + }] + }; + } +} + +function mapDataFrameRowNamesAssignment( + operand: RFunctionArgument +): DataFrameInfo | undefined { + if(operand !== EmptyArgument && operand?.value !== undefined) { + return { + type: 'expression', + operations: [{ + operand: operand.value.info.id, + operation: 'identity', + args: {} + }] + }; + } +} + +function mapDataFrameDimNamesAssignment( + operand: RFunctionArgument +): DataFrameInfo | undefined { + if(operand !== EmptyArgument && operand.value !== undefined) { + return { + type: 'expression', + operations: [{ + operand: operand.value.info.id, + operation: 'unknown', + args: { modifyInplace: true } + }] + }; + } +} diff --git a/src/abstract-interpretation/data-frame/mappers/function-mapper.ts b/src/abstract-interpretation/data-frame/mappers/function-mapper.ts new file mode 100644 index 0000000000..3072d32e8c --- /dev/null +++ b/src/abstract-interpretation/data-frame/mappers/function-mapper.ts @@ -0,0 +1,233 @@ +import type { ResolveInfo } from '../../../dataflow/environments/resolve-by-name'; +import type { DataflowGraph } from '../../../dataflow/graph/graph'; +import { VertexType } from '../../../dataflow/graph/vertex'; +import { toUnnamedArgument } from '../../../dataflow/internal/process/functions/call/argument/make-argument'; +import type { RNode } from '../../../r-bridge/lang-4.x/ast/model/model'; +import type { RFunctionArgument, RFunctionCall } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; +import { startAndEndsWith } from '../../../util/strings'; +import type { 
AbstractInterpretationInfo, DataFrameInfo } from '../absint-info'; +import { DataFrameTop } from '../domain'; +import { resolveIdToArgName, resolveIdToArgValue, resolveIdToArgVectorLength } from '../resolve-args'; + +const ColNamesRegex = /^[A-Za-z.][A-Za-z0-9_.]*$/; + +const DataFrameFunctionMapper = { + 'data.frame': mapDataFrameCreate, + 'as.data.frame': mapDataFrameUnknownCreate, + 'read.csv': mapDataFrameUnknownCreate, + 'read.table': mapDataFrameUnknownCreate, + 'cbind': mapDataFrameColBind, + 'rbind': mapDataFrameRowBind +} as const satisfies Record; + +const SpecialFunctionArgumentsMapper: Partial> = { + 'data.frame': ['row.names', 'check.rows', 'check.names', 'fix.empty.names', 'stringsAsFactors'] +}; + +type DataFrameFunctionMapping = ( + args: readonly RFunctionArgument[], + info: ResolveInfo +) => DataFrameInfo | undefined; + +type DataFrameFunction = keyof typeof DataFrameFunctionMapper; + +export function mapDataFrameFunctionCall( + node: RNode, + dfg: DataflowGraph +): DataFrameInfo | undefined { + if(node.type === RType.FunctionCall && node.named && node.functionName.content in DataFrameFunctionMapper) { + const args = getFunctionArguments(node, dfg); + const functionName = node.functionName.content as DataFrameFunction; + const functionProcessor = DataFrameFunctionMapper[functionName]; + + return functionProcessor(args, { graph: dfg, idMap: dfg.idMap, full: true }); + } +} + +function mapDataFrameCreate( + args: readonly RFunctionArgument[], + info: ResolveInfo +): DataFrameInfo { + const columnArgs = getEffectiveArgs('data.frame', args); + + const argNames = columnArgs.map(arg => arg ? resolveIdToArgName(arg, info) : undefined).map(unescapeArgument); + const argLengths = columnArgs.map(arg => arg ? resolveIdToArgVectorLength(arg, info) : undefined); + const colnames = argNames.map(arg => isValidColName(arg) ? arg : undefined); + const rows = argLengths.every(arg => arg !== undefined) ? 
Math.max(...argLengths, 0) : undefined; + + return { + type: 'expression', + operations: [{ + operation: 'create', + operand: undefined, + args: { colnames, rows } + }] + }; +} + +function mapDataFrameUnknownCreate(): DataFrameInfo { + return { + type: 'expression', + operations: [{ + operation: 'unknown', + operand: undefined, + args: { creation: true } + }] + }; +} + +function mapDataFrameColBind( + args: readonly RFunctionArgument[], + info: ResolveInfo +): DataFrameInfo | undefined { + const dataFrame = args.find(isDataFrameArgument); + + if(dataFrame === undefined || dataFrame === EmptyArgument || dataFrame.value === undefined) { + return; + } else if(args.length === 1) { + return { + type: 'expression', + operations: [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }] + }; + } + const result: DataFrameInfo = { type: 'expression', operations: [] }; + let operand: RNode | undefined = dataFrame.value; + let colnames: (string | undefined)[] | undefined = []; + + for(const arg of args) { + if(arg !== dataFrame && arg !== EmptyArgument) { + if(arg.value !== undefined && isDataFrameArgument(arg)) { + const other = arg.value.info.dataFrame?.domain?.get(arg.value.info.id) ?? 
DataFrameTop; + + result.operations.push({ + operation: 'concatCols', + operand: operand?.info.id, + args: { other: other } + }); + operand = undefined; + // Added columns are unknown if argument cannot be resolved to constant (vector-like) value + } else if(resolveIdToArgValue(arg, info) !== undefined) { + const colname = unescapeArgument(resolveIdToArgName(arg, info)); + colnames?.push(colname); + } else { + colnames = undefined; + } + } + } + if(colnames === undefined || colnames.length > 0) { + result.operations.push({ + operation: 'addCols', + operand: operand?.info.id, + args: { colnames: colnames } + }); + } + return result; +} + +function mapDataFrameRowBind( + args: readonly RFunctionArgument[], + info: ResolveInfo +): DataFrameInfo | undefined { + const dataFrame = args.find(isDataFrameArgument); + + if(dataFrame === undefined || dataFrame === EmptyArgument || dataFrame.value === undefined) { + return; + } else if(args.length === 1) { + return { + type: 'expression', + operations: [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }] + }; + } + const result: DataFrameInfo = { type: 'expression', operations: [] }; + let operand: RNode | undefined = dataFrame.value; + let rows: number | undefined = 0; + + for(const arg of args) { + if(arg !== dataFrame && arg !== EmptyArgument) { + if(arg.value !== undefined && isDataFrameArgument(arg)) { + const other = arg.value.info.dataFrame?.domain?.get(arg.value.info.id) ?? DataFrameTop; + + result.operations.push({ + operation: 'concatRows', + operand: operand?.info.id, + args: { other: other } + }); + operand = undefined; + // Number of added rows is unknown if arguments cannot be resolved to constant (vector-like) value + } else if(resolveIdToArgValue(arg, info) !== undefined) { + rows = rows !== undefined ? 
rows + 1 : undefined; + } else { + rows = undefined; + } + } + } + if(rows === undefined || rows > 0) { + result.operations.push({ + operation: 'addRows', + operand: operand?.info.id, + args: { rows: rows } + }); + } + return result; +} + +function getFunctionArguments( + node: RFunctionCall, + dfg: DataflowGraph +): readonly RFunctionArgument[] { + const vertex = dfg.getVertex(node.info.id); + + if(vertex?.tag === VertexType.FunctionCall && dfg.idMap !== undefined) { + const idMap = dfg.idMap; + + return vertex.args + .map(arg => arg === EmptyArgument ? arg : dfg.idMap?.get(arg.nodeId)) + .map(arg => arg === EmptyArgument || arg?.type === RType.Argument ? arg : toUnnamedArgument(arg, idMap)); + } + return node.arguments; +} + +function getEffectiveArgs( + funct: keyof typeof SpecialFunctionArgumentsMapper, + args: readonly RFunctionArgument[] +): readonly RFunctionArgument[] { + const ignoredArgs = SpecialFunctionArgumentsMapper[funct] ?? []; + + return args.filter(arg => arg === EmptyArgument || arg.name === undefined || !ignoredArgs.includes(arg.name.content)); +} + +function isDataFrameArgument( + arg: RFunctionArgument +): boolean { + if(arg === EmptyArgument || arg.value === undefined) { + return false; + } + return arg.value.info.dataFrame?.domain?.get(arg.value.info.id) !== undefined; +} + +function isValidColName(colname: string | undefined): boolean { + return colname !== undefined && ColNamesRegex.test(colname); +} + +function unescapeArgument(argument: undefined): undefined; +function unescapeArgument(argument: string): string; +function unescapeArgument(argument: string | undefined): string | undefined; +function unescapeArgument(argument: string | undefined): string | undefined { + if(argument === undefined) { + return undefined; + } else if(startAndEndsWith(argument, '`') || startAndEndsWith(argument, '"') || startAndEndsWith(argument, '\'')) { + return argument.slice(1, -1); + } + return argument; +} diff --git 
a/src/abstract-interpretation/data-frame/processor.ts b/src/abstract-interpretation/data-frame/processor.ts index a5568af3ce..602f62c674 100644 --- a/src/abstract-interpretation/data-frame/processor.ts +++ b/src/abstract-interpretation/data-frame/processor.ts @@ -1,44 +1,198 @@ +import type { BuiltInMappingName } from '../../dataflow/environments/built-in'; +import { DefaultBuiltinConfig } from '../../dataflow/environments/default-builtin-config'; +import { EdgeType } from '../../dataflow/graph/edge'; import { type DataflowGraph } from '../../dataflow/graph/graph'; -import type { RNode } from '../../r-bridge/lang-4.x/ast/model/model'; +import type { NoInfo, RNode, RSingleNode } from '../../r-bridge/lang-4.x/ast/model/model'; +import type { RAccess } from '../../r-bridge/lang-4.x/ast/model/nodes/r-access'; +import type { RArgument } from '../../r-bridge/lang-4.x/ast/model/nodes/r-argument'; import type { RBinaryOp } from '../../r-bridge/lang-4.x/ast/model/nodes/r-binary-op'; import type { RFunctionCall } from '../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import { EmptyArgument } from '../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import type { RIfThenElse } from '../../r-bridge/lang-4.x/ast/model/nodes/r-if-then-else'; +import type { RPipe } from '../../r-bridge/lang-4.x/ast/model/nodes/r-pipe'; +import type { RUnaryOp } from '../../r-bridge/lang-4.x/ast/model/nodes/r-unary-op'; import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; import type { RFalse, RTrue } from '../../r-bridge/lang-4.x/convert-values'; -import type { DataFrameStateDomain } from './domain'; +import type { AbstractInterpretationInfo } from './absint-info'; +import type { DataFrameDomain, DataFrameStateDomain } from './domain'; +import { DataFrameTop, joinDataFrames } from 
'./domain'; +import { applySemantics, ConstraintType, getConstraintTypes } from './semantics'; +import { mapDataFrameSemantics } from './semantics-mapper'; export type ConditionalDataFrameState = Record<'FD' | typeof RTrue | typeof RFalse, DataFrameStateDomain>; -export function processDataFrameNode( - node: RNode, +type ROperation = RFunctionCall | RUnaryOp | RBinaryOp | RAccess; +type RComplexNode = Exclude, RSingleNode>; + +type DataFrameProcessor> = ( + node: Node, + domain: DataFrameStateDomain, + dfg: DataflowGraph +) => DataFrameStateDomain; + +type DataFrameProcessorMapping = { + [Node in RComplexNode as Node['type']]: DataFrameProcessor; +} + +const DataFrameProcessorMapper: DataFrameProcessorMapping = { + [RType.ExpressionList]: processDataFrameNothing, + [RType.FunctionCall]: processDataFrameOperation, + [RType.UnaryOp]: processDataFrameOperation, + [RType.BinaryOp]: processDataFrameOperation, + [RType.Access]: processDataFrameOperation, + [RType.Pipe]: processDataFramePipe, + [RType.Argument]: processDataFrameArgument, + [RType.IfThenElse]: processDataFrameIfThenElse, + [RType.ForLoop]: processDataFrameNothing, + [RType.RepeatLoop]: processDataFrameNothing, + [RType.WhileLoop]: processDataFrameNothing, + [RType.FunctionDefinition]: processDataFrameNothing, + [RType.Parameter]: processDataFrameNothing +}; + +export function processDataFrameLeaf( + node: RSingleNode, + domain: DataFrameStateDomain, + dfg: DataflowGraph +): DataFrameStateDomain { + if(node.type === RType.Symbol) { + resolveIdToAbstractValue(node.info.id, domain, dfg); + } + updateDomainOfId(node, domain, dfg); + return domain; +} + +export function processDataFrameExpression>( + node: Node, + domain: DataFrameStateDomain, + dfg: DataflowGraph +): DataFrameStateDomain { + const nodeType: Node['type'] = node.type; + const processor = DataFrameProcessorMapper[nodeType] as DataFrameProcessor; + + const result = processor(node, domain, dfg); + updateDomainOfId(node, result, dfg); + + return 
result; +} + +function processDataFrameOperation( + node: ROperation, + domain: DataFrameStateDomain, + dfg: DataflowGraph +): DataFrameStateDomain { + const origin = DefaultBuiltinConfig.find(entry => entry.names.includes(node.lexeme)); + const processor = origin?.type === 'function' ? origin.processor as BuiltInMappingName : 'builtin:default'; + node.info.dataFrame = mapDataFrameSemantics(node, dfg, processor); + + if(node.info.dataFrame?.type === 'assignment') { + const value = resolveIdToAbstractValue(node.info.dataFrame.expression, domain, dfg); + + if(value !== undefined) { + domain.set(node.info.dataFrame.identifier, value); + updateDomainOfId(node.info.dataFrame.identifier, domain, dfg); + domain.set(node.info.id, value); + } + } else if(node.info.dataFrame?.type === 'expression') { + let value = DataFrameTop; + + for(const operation of node.info.dataFrame.operations) { + const operandValue = operation.operand ? resolveIdToAbstractValue(operation.operand, domain, dfg) : value; + value = applySemantics(operation.operation, operandValue ?? 
DataFrameTop, operation.args); + + if(operation.operand !== undefined && getConstraintTypes(operation.operation).some(type => type === ConstraintType.OperandPrecondition || type === ConstraintType.OperandModification)) { + assignAbstractValueToId(operation.operand, value, domain, dfg); + } + } + if(node.info.dataFrame.operations.some(operation => getConstraintTypes(operation.operation).includes(ConstraintType.ResultPostcondition))) { + domain.set(node.info.id, value); + } + } + return domain; +} + +function processDataFramePipe( + node: RPipe, domain: DataFrameStateDomain, dfg: DataflowGraph -): DataFrameStateDomain | ConditionalDataFrameState { - switch(node.type) { - case RType.BinaryOp: - return processBinaryOp(node, domain, dfg); - case RType.FunctionCall: - return processFunctionCall(node, domain, dfg); - default: - return domain; +): DataFrameStateDomain { + const value = resolveIdToAbstractValue(node.rhs.info.id, domain, dfg); + + if(value !== undefined) { + domain.set(node.info.id, value); } + return domain; } -function processBinaryOp( - node: RBinaryOp, +function processDataFrameArgument( + node: RArgument, domain: DataFrameStateDomain, dfg: DataflowGraph -): DataFrameStateDomain | ConditionalDataFrameState { - console.log(node, dfg.get(node.info.id), { lhs: dfg.get(node.lhs.info.id), rhs: dfg.get(node.rhs.info.id) }); +): DataFrameStateDomain { + if(node.value !== undefined) { + const value = resolveIdToAbstractValue(node.value.info.id, domain, dfg); + + if(value !== undefined) { + domain.set(node.info.id, value); + } + } return domain; } -function processFunctionCall( - node: RFunctionCall, +function processDataFrameIfThenElse( + node: RIfThenElse, domain: DataFrameStateDomain, dfg: DataflowGraph -): DataFrameStateDomain | ConditionalDataFrameState { - console.log(node, dfg.get(node.info.id), node.arguments.map(arg => arg !== EmptyArgument && arg.value ? 
dfg.get(arg.value?.info.id) : undefined)); +): DataFrameStateDomain { + const thenExit = node.then.children.at(-1); + const elseExit = node.otherwise?.children.at(-1); + const thenReturn = thenExit ? resolveIdToAbstractValue(thenExit.info.id, domain, dfg) : undefined; + const elseReturn = elseExit ? resolveIdToAbstractValue(elseExit.info.id, domain, dfg) : undefined; + + if(thenReturn !== undefined || elseReturn !== undefined) { + const returnValue = joinDataFrames(thenReturn ?? DataFrameTop, elseReturn ?? DataFrameTop); + domain.set(node.info.id, returnValue); + } + return domain; +} + +function processDataFrameNothing( + node: RComplexNode, + domain: DataFrameStateDomain +): DataFrameStateDomain { return domain; } + +function assignAbstractValueToId(id: NodeId, value: DataFrameDomain, domain: DataFrameStateDomain, dfg: DataflowGraph): void { + dfg.outgoingEdges(id)?.entries() + .filter(([, edge]) => edge.types === EdgeType.Reads) + .map(([id]) => id) + .forEach(origin => domain.set(origin, value)); +} + +function resolveIdToAbstractValue(id: NodeId, domain: DataFrameStateDomain, dfg: DataflowGraph): DataFrameDomain | undefined { + if(domain.has(id)) { + return domain.get(id); + } + const origins = dfg.outgoingEdges(id)?.entries() + .filter(([, edge]) => edge.types === EdgeType.Reads) + .map(([id]) => domain.get(id)) + .toArray(); + + if(origins !== undefined && origins.length > 0 && origins.some(origin => origin !== undefined)) { + const result = joinDataFrames(...origins.map(origin => origin ?? DataFrameTop)); + domain.set(id, result); + + return result; + } +} + +function updateDomainOfId(id: NodeId | RNode, domain: DataFrameStateDomain, dfg: DataflowGraph): void { + const node: RNode | undefined = typeof id === 'object' ? 
id : dfg.idMap?.get(id); + + if(node !== undefined) { + node.info.dataFrame ??= { type: 'other' }; + node.info.dataFrame.domain = new Map(domain); + } +} diff --git a/src/abstract-interpretation/data-frame/resolve-args.ts b/src/abstract-interpretation/data-frame/resolve-args.ts index 4d99ba0a2d..85dbff8fef 100644 --- a/src/abstract-interpretation/data-frame/resolve-args.ts +++ b/src/abstract-interpretation/data-frame/resolve-args.ts @@ -1,44 +1,103 @@ import type { ResolveInfo } from '../../dataflow/environments/resolve-by-name'; import { resolveIdToValue } from '../../dataflow/environments/resolve-by-name'; -import type { RNodeWithParent } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { RArgument } from '../../r-bridge/lang-4.x/ast/model/nodes/r-argument'; +import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; +import { unwrapRValue, unwrapRValueToString, unwrapRVector } from '../../util/r-value'; -export function resolveIdToArgName(id: NodeId | RNodeWithParent, { graph, idMap } : ResolveInfo): string | undefined { - idMap ??= graph?.idMap; - const node = typeof id === 'object' ? 
id : idMap?.get(id); +/** + * Returns the argument name of a function argument + */ +export function resolveIdToArgName(id: NodeId | RArgument, info: ResolveInfo): string | undefined { + const node = resolveIdToArgument(id, info); - if(node?.type === RType.Argument) { - return node.name?.content; + return node?.name?.content; +} + +/** + * Resolves the value of a function argument as string, number, boolean, string vector, number vector, boolean vector, or mixed vector using {@link resolveIdToValue} + */ +export function resolveIdToArgValue(id: NodeId | RArgument, info: ResolveInfo): string | number | boolean | string[] | number[] | boolean[] | (string | number | boolean)[] | undefined { + const node = resolveIdToArgument(id, info); + + if(node?.value !== undefined) { + const resolvedValue = resolveIdToValue(node.value, info); + + if(resolvedValue?.length === 1) { + if(Array.isArray(resolvedValue[0])) { + return unwrapRVector(resolvedValue[0]); + } else { + return unwrapRValue(resolvedValue[0]); + } + } } return undefined; } -export function resolveIdToArgValueSymbolName(id: NodeId | RNodeWithParent, { graph, idMap } : ResolveInfo): string | undefined { - idMap ??= graph?.idMap; - const node = typeof id === 'object' ? 
id : idMap?.get(id); +/** + * Resolves the value of a function argument to a string vector using {@link resolveIdToValue} and {@link unwrapRValueToString} + */ +export function resolveIdToArgStringVector(id: NodeId | RArgument, info: ResolveInfo): string[] | undefined { + const node = resolveIdToArgument(id, info); - if(node?.type === RType.Argument && node.value !== undefined) { - if(node.value.type === RType.Symbol) { - return node.value.content; - } else if(node.value.type === RType.String) { - return node.value.content.str; + if(node?.value !== undefined) { + const resolvedValue = resolveIdToValue(node.value, info); + + if(resolvedValue?.length === 1) { + if(Array.isArray(resolvedValue[0])) { + const array = resolvedValue[0].map(unwrapRValueToString); + return array.every(value => value !== undefined) ? array : undefined; + } else { + const value: unknown = resolvedValue[0]; + const result = unwrapRValueToString(value); + return result !== undefined ? [result] : undefined; + } } } return undefined; } -export function resolveIdToArgVectorLength(id: NodeId | RNodeWithParent, { graph, idMap, ...resolveInfo } : ResolveInfo): number | undefined { - idMap ??= graph?.idMap; - const node = typeof id === 'object' ? 
id : idMap?.get(id); +/** + * Returns the symbol name or string value of the value of a function argument + */ +export function resolveIdToArgValueSymbolName(id: NodeId | RArgument, info: ResolveInfo): string | undefined { + const node = resolveIdToArgument(id, info); - if(node?.type !== RType.Argument || node.value === undefined) { - return undefined; + if(node?.value?.type === RType.Symbol) { + return node.value.content; + } else if(node?.value?.type === RType.String) { + return node.value.content.str; } - const resolvedValue = resolveIdToValue(node.value, { graph, idMap, ...resolveInfo }); + return undefined; +} + +/** + * Resolves the vector length of the value of a function argument using {@link resolveIdToValue} + */ +export function resolveIdToArgVectorLength(id: NodeId | RArgument, info: ResolveInfo): number | undefined { + const node = resolveIdToArgument(id, info); + + if(node?.value !== undefined) { + const resolvedValue = resolveIdToValue(node.value, info); - if(resolvedValue?.length === 1) { - return Array.isArray(resolvedValue[0]) ? resolvedValue[0].length : undefined; + if(resolvedValue?.length === 1) { + if(Array.isArray(resolvedValue[0])) { + return resolvedValue[0].length; + } else if(unwrapRValue(resolvedValue[0]) !== undefined) { + return 1; + } + } + } + return undefined; +} + +function resolveIdToArgument(id: NodeId | RArgument, { graph, idMap }: ResolveInfo): RArgument | undefined { + idMap ??= graph?.idMap; + const node = typeof id === 'object' ? 
id : idMap?.get(id); + + if(node?.type === RType.Argument) { + return node; } return undefined; } diff --git a/src/abstract-interpretation/data-frame/semantics-mapper.ts b/src/abstract-interpretation/data-frame/semantics-mapper.ts new file mode 100644 index 0000000000..6fdc648e80 --- /dev/null +++ b/src/abstract-interpretation/data-frame/semantics-mapper.ts @@ -0,0 +1,38 @@ +import type { BuiltInMappingName } from '../../dataflow/environments/built-in'; +import type { DataflowGraph } from '../../dataflow/graph/graph'; +import type { RNode } from '../../r-bridge/lang-4.x/ast/model/model'; +import type { RAccess, RNamedAccess } from '../../r-bridge/lang-4.x/ast/model/nodes/r-access'; +import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import type { DataFrameInfo } from './absint-info'; +import { mapDataFrameAccess } from './mappers/access-mapper'; +import { mapDataFrameAssignment } from './mappers/assignment-mapper'; +import { mapDataFrameFunctionCall } from './mappers/function-mapper'; + +const DataFrameProcessorMapper = { + 'builtin:default': mapDataFrameFunctionCall, + 'builtin:assignment': mapDataFrameAssignment, + 'builtin:access': mapDataFrameAccess, +} as const satisfies Partial>; + +type DataFrameProcessor = ( + node: RNode, + dfg: DataflowGraph +) => DataFrameInfo | undefined; + +export function mapDataFrameSemantics( + node: RNode, + dfg: DataflowGraph, + origin: BuiltInMappingName +): DataFrameInfo | undefined { + if(origin in DataFrameProcessorMapper) { + const mapperFunction = DataFrameProcessorMapper[origin as keyof typeof DataFrameProcessorMapper]; + + return mapperFunction(node, dfg); + } +} + +export function isStringBasedAccess( + access: RAccess +): access is RNamedAccess { + return access.operator === '$' || access.operator === '@'; +} diff --git a/src/abstract-interpretation/data-frame/semantics.ts b/src/abstract-interpretation/data-frame/semantics.ts new file mode 100644 index 0000000000..68af248f7a 
--- /dev/null +++ b/src/abstract-interpretation/data-frame/semantics.ts @@ -0,0 +1,223 @@ +import type { DataFrameDomain } from './domain'; +import { addInterval, ColNamesTop, DataFrameTop, includeZeroInterval, IntervalTop, joinColNames, joinInterval, subtractColNames, subtractInterval } from './domain'; + +export enum ConstraintType { + /** The inferred constraints must hold for the operand at the point of the operation */ + OperandPrecondition, + /** The inferred constraints are applied to the operand during the operation */ + OperandModification, + /** The inferred constraints must hold for the returned result of the operation */ + ResultPostcondition +} + +type DataFrameSemanticsApplier = ( + value: DataFrameDomain, + args: Arguments +) => DataFrameDomain; + +type DataFrameSemanticsMapperInfo = { + readonly apply: DataFrameSemanticsApplier, + readonly types: ConstraintType[] +} + +const DataFrameSemanticsMapper = { + 'create': { apply: applyCreateSemantics, types: [ConstraintType.ResultPostcondition] }, + 'accessCol': { apply: applyAccessColSemantics, types: [ConstraintType.OperandPrecondition] }, + 'accessRow': { apply: applyAccessRowSemantics, types: [ConstraintType.OperandPrecondition] }, + 'assignCol': { apply: applyAssignColSemantics, types: [ConstraintType.OperandModification] }, + 'assignRow': { apply: applyAssignRowSemantics, types: [ConstraintType.OperandModification] }, + 'setColNames': { apply: applySetColNamesSemantics, types: [ConstraintType.OperandModification] }, + 'addCols': { apply: applyAddColsSemantics, types: [ConstraintType.ResultPostcondition] }, + 'addRows': { apply: applyAddRowsSemantics, types: [ConstraintType.ResultPostcondition] }, + 'removeCols': { apply: applyRemoveColsSemantics, types: [ConstraintType.ResultPostcondition] }, + 'removeRows': { apply: applyRemoveRowsSemantics, types: [ConstraintType.ResultPostcondition] }, + 'concatCols': { apply: applyConcatColsSemantics, types: [ConstraintType.ResultPostcondition] }, + 
'concatRows': { apply: applyConcatRowsSemantics, types: [ConstraintType.ResultPostcondition] }, + 'identity': { apply: applyIdentitySemantics, types: [ConstraintType.ResultPostcondition] }, + 'unknown': { apply: applyUnknownSemantics, types: [ConstraintType.ResultPostcondition] } +} as const satisfies Record>; + +export type DataFrameOperationName = keyof typeof DataFrameSemanticsMapper; +export type DataFrameOperationArgs = Parameters[1]; + +export function applySemantics( + operation: Name, + value: DataFrameDomain, + args: DataFrameOperationArgs +): DataFrameDomain { + const applier = DataFrameSemanticsMapper[operation] as DataFrameSemanticsMapperInfo>; + + return applier.apply(value, args); +} + +export function getConstraintTypes(operation: DataFrameOperationName): ConstraintType[] { + return DataFrameSemanticsMapper[operation].types; +} + +function applyCreateSemantics( + value: DataFrameDomain, + { colnames, rows }: { colnames: (string | undefined)[], rows: number | undefined } +): DataFrameDomain { + return { + colnames: colnames.every(name => name !== undefined) ? colnames : ColNamesTop, + cols: [colnames.length, colnames.length], + rows: rows !== undefined ? 
[rows, rows] : IntervalTop + }; +} + +function applyAccessColSemantics( + value: DataFrameDomain, + { columns }: { columns: string[] | number[] | undefined } +): DataFrameDomain { + if(columns?.every(col => typeof col === 'string')) { + return { + ...value, + colnames: joinColNames(value.colnames, columns) + }; + } else if(columns?.every(col => typeof col === 'number')) { + return { + ...value, + cols: columns.reduce((a, b) => joinInterval(a, [b, b]), value.cols) + }; + } + return value; +} + +function applyAccessRowSemantics( + value: DataFrameDomain, + { rows }: { rows: number[] | undefined } +): DataFrameDomain { + if(rows !== undefined) { + return { + ...value, + rows: rows.reduce((a, b) => joinInterval(a, [b, b]), value.rows) + }; + } + return value; +} + +function applyAssignColSemantics( + value: DataFrameDomain, + { columns }: { columns: string[] | number[] | undefined } +): DataFrameDomain { + if(columns?.every(col => typeof col === 'string')) { + return { + ...value, + colnames: joinColNames(value.colnames, columns) + }; + } else if(columns?.every(col => typeof col === 'number')) { + return { + ...value, + cols: columns.reduce((a, b) => joinInterval(a, [b, b]), value.cols) + }; + } + return { + ...value, + colnames: ColNamesTop, + cols: IntervalTop + }; +} + +function applyAssignRowSemantics( + value: DataFrameDomain, + { rows }: { rows: number[] | undefined } +): DataFrameDomain { + if(rows !== undefined) { + return { + ...value, + rows: rows.reduce((a, b) => joinInterval(a, [b, b]), value.rows) + }; + } + return { + ...value, + rows: IntervalTop + }; +} + +function applySetColNamesSemantics( + value: DataFrameDomain, + { colnames }: { colnames: (string | undefined)[] | undefined } +): DataFrameDomain { + return { + ...value, + colnames: colnames?.every(name => name !== undefined) ? colnames : ColNamesTop, + cols: colnames !== undefined ? 
[colnames.length, colnames.length] : IntervalTop + }; +} + +function applyAddColsSemantics( + value: DataFrameDomain, + { colnames }: { colnames: (string | undefined)[] | undefined } +): DataFrameDomain { + return { + ...value, + colnames: colnames?.every(col => col !== undefined) ? joinColNames(value.colnames, colnames) : ColNamesTop, + cols: colnames !== undefined ? addInterval(value.cols, [colnames.length, colnames.length]) : IntervalTop + }; +} + +function applyAddRowsSemantics( + value: DataFrameDomain, + { rows }: { rows: number | undefined } +): DataFrameDomain { + return { + ...value, + rows: rows !== undefined ? addInterval(value.rows, [rows, rows]) : IntervalTop + }; +} + +function applyRemoveColsSemantics( + value: DataFrameDomain, + { colnames }: { colnames: (string | undefined)[] | undefined } +): DataFrameDomain { + return { + ...value, + colnames: colnames !== undefined ? subtractColNames(value.colnames, colnames.filter(col => col !== undefined)) : value.colnames, + cols: colnames !== undefined ? subtractInterval(value.cols, [colnames.length, colnames.length]) : includeZeroInterval(value.cols) + }; +} + +function applyRemoveRowsSemantics( + value: DataFrameDomain, + { rows }: { rows: number | undefined } +): DataFrameDomain { + return { + ...value, + rows: rows !== undefined ? 
subtractInterval(value.rows, [rows, rows]) : includeZeroInterval(value.rows) + }; +} + +function applyConcatColsSemantics( + value: DataFrameDomain, + { other }: { other: DataFrameDomain } +): DataFrameDomain { + return { + ...value, + colnames: joinColNames(value.colnames, other.colnames), + cols: addInterval(value.cols, other.cols) + }; +} + +function applyConcatRowsSemantics( + value: DataFrameDomain, + { other }: { other: DataFrameDomain } +): DataFrameDomain { + return { + ...value, + rows: addInterval(value.rows, other.rows) + }; +} + +function applyIdentitySemantics( + value: DataFrameDomain, + _args: Record +): DataFrameDomain { + return value; +} + +function applyUnknownSemantics( + _value: DataFrameDomain, + _args: { creation?: boolean, modifyInplace?: boolean } +): DataFrameDomain { + return DataFrameTop; +} diff --git a/src/cli/repl/commands/repl-cfg.ts b/src/cli/repl/commands/repl-cfg.ts index 49fe68249d..d56687cefd 100644 --- a/src/cli/repl/commands/repl-cfg.ts +++ b/src/cli/repl/commands/repl-cfg.ts @@ -5,8 +5,8 @@ import { fileProtocol, requestFromInput } from '../../../r-bridge/retriever'; import { cfgToMermaid, cfgToMermaidUrl } from '../../../util/mermaid/cfg'; import type { KnownParser } from '../../../r-bridge/parser'; import { ColorEffect, Colors, FontStyles } from '../../../util/ansi'; -import { extractSimpleCFG } from '../../../abstract-interpretation/simple-cfg'; -import { performDataFrameAbsint } from '../../../abstract-interpretation/data-frame/abstract-interpretation'; +import { mermaidCodeToUrl } from '../../../util/mermaid/mermaid'; +import { flipCfg, performDataFrameAbsint } from '../../../abstract-interpretation/data-frame/abstract-interpretation'; async function controlflow(parser: KnownParser, remainingLine: string) { return await createDataflowPipeline(parser, { @@ -67,13 +67,17 @@ export const absintDataFrameCommand: ReplCommand = { script: false, fn: async(output, shell, remainingLine) => { const result = await 
controlflow(shell, handleString(remainingLine)); - const cfg = extractSimpleCFG(result.normalize); - const mermaid = cfgToMermaidUrl(cfg, result.normalize); + + const cfg = extractCFG(result.normalize, result.dataflow.graph); + const forwardCfg = { ...cfg, graph: flipCfg(cfg.graph) }; + const mermaid = cfgToMermaid(forwardCfg, result.normalize).replace('flowchart BT', 'flowchart TB'); + const mermaidUrl = mermaidCodeToUrl(mermaid); + const domain = performDataFrameAbsint(cfg, result.dataflow.graph); + console.log(domain); try { const clipboard = await import('clipboardy'); - clipboard.default.writeSync(mermaid); + clipboard.default.writeSync(mermaidUrl); output.stdout(formatInfo(output, 'mermaid url')); } catch{ /* do nothing this is a service thing */ } - performDataFrameAbsint(cfg, result.dataflow.graph); } }; diff --git a/src/dataflow/environments/resolve-by-name.ts b/src/dataflow/environments/resolve-by-name.ts index 23eac76cc1..41e35243be 100644 --- a/src/dataflow/environments/resolve-by-name.ts +++ b/src/dataflow/environments/resolve-by-name.ts @@ -18,6 +18,7 @@ import { envFingerprint } from '../../slicing/static/fingerprint'; import { EdgeType } from '../graph/edge'; import { EmptyArgument } from '../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; import type { RNumberValue } from '../../r-bridge/lang-4.x/convert-values'; +import { isRNumberValue } from '../../util/r-value'; const FunctionTargetTypes = ReferenceType.Function | ReferenceType.BuiltInFunction | ReferenceType.Unknown | ReferenceType.Argument | ReferenceType.Parameter; @@ -385,7 +386,3 @@ function createNumberSequence(start: RNumberValue, end: RNumberValue): RNumberVa return sequence.map(value => ({ ...start, num: value })); } - -function isRNumberValue(value: unknown): value is RNumberValue { - return typeof value === 'object' && value !== null && 'num' in value && typeof value.num === 'number'; -} \ No newline at end of file diff --git a/src/util/cfg/cfg.ts b/src/util/cfg/cfg.ts index 
9907fb1551..d5787e09f4 100644 --- a/src/util/cfg/cfg.ts +++ b/src/util/cfg/cfg.ts @@ -86,6 +86,16 @@ export class ControlFlowGraph { return this; } + ingoing(node: NodeId): ReadonlyMap | undefined { + const edges = new Map(); + for(const [source, outgoing] of this.edgeInformation.entries()) { + if(outgoing.has(node)) { + edges.set(source, outgoing.get(node) as CfgEdge); + } + } + return edges; + } + outgoing(node: NodeId): ReadonlyMap | undefined { return this.edgeInformation.get(node); } @@ -201,11 +211,11 @@ function cfgLeaf(type: CfgVertexType): (leaf: RNodeWithParent) => ControlFlowInf } function cfgBreak(leaf: RNodeWithParent): ControlFlowInformation { - return { ...cfgLeaf(CfgVertexType.Statement)(leaf), breaks: [leaf.info.id] }; + return { ...cfgLeaf(CfgVertexType.Statement)(leaf), breaks: [leaf.info.id], exitPoints: [] }; } function cfgNext(leaf: RNodeWithParent): ControlFlowInformation { - return { ...cfgLeaf(CfgVertexType.Statement)(leaf), nexts: [leaf.info.id] }; + return { ...cfgLeaf(CfgVertexType.Statement)(leaf), nexts: [leaf.info.id], exitPoints: [] }; } function cfgIgnore(_leaf: RNodeWithParent): ControlFlowInformation { @@ -544,31 +554,46 @@ function cfgBinaryOp(binOp: RBinaryOp | RPipe, name: ControlFlowInformation, accessors: readonly (ControlFlowInformation | typeof EmptyArgument)[]): ControlFlowInformation { - const result = name; - const graph = result.graph; + const graph = name.graph; + const info = { graph, breaks: [...name.breaks], nexts: [...name.nexts], returns: [...name.returns], exitPoints: [access.info.id + '-exit'], entryPoints: [access.info.id] }; + graph.addVertex({ id: access.info.id, name: access.type, type: CfgVertexType.Expression }); - graph.addVertex({ id: access.info.id + '-exit', name: 'access-exit', type: CfgVertexType.EndMarker }); - for(const entry of name.entryPoints) { - graph.addEdge(entry, access.info.id, { label: 'FD' }); + + for(const entryPoint of name.entryPoints) { + graph.addEdge(entryPoint, access.info.id, { 
label: 'FD' }); } - for(const exit of name.exitPoints) { - graph.addEdge(access.info.id, exit, { label: 'FD' }); + + graph.addVertex({ id: access.info.id + '-name', name: 'access-name', type: CfgVertexType.MidMarker }); + + for(const exitPoint of name.exitPoints) { + graph.addEdge(access.info.id + '-name', exitPoint, { label: 'FD' }); } - result.entryPoints = [access.info.id]; - result.exitPoints = [access.info.id + '-exit']; + graph.addVertex({ id: access.info.id + '-exit', name: 'access-exit', type: CfgVertexType.EndMarker }); + + let lastArgExits: NodeId[] = [access.info.id + '-name']; + for(const accessor of accessors) { if(accessor === EmptyArgument) { continue; } graph.merge(accessor.graph); + info.breaks.push(...accessor.breaks); + info.nexts.push(...accessor.nexts); + info.returns.push(...accessor.returns); for(const entry of accessor.entryPoints) { - graph.addEdge(entry, access.info.id, { label: 'FD' }); - } - for(const exit of accessor.exitPoints) { - graph.addEdge(access.info.id + '-exit', exit, { label: 'FD' }); + for(const exit of lastArgExits) { + graph.addEdge(entry, exit, { label: 'FD' }); + } } + + lastArgExits = accessor.exitPoints; } - return result; + + for(const exit of lastArgExits) { + graph.addEdge(access.info.id + '-exit', exit, { label: 'FD' }); + } + + return info; } function cfgUnaryOp(unary: RNodeWithParent, operand: ControlFlowInformation): ControlFlowInformation { diff --git a/src/util/r-value.ts b/src/util/r-value.ts new file mode 100644 index 0000000000..b29f420a59 --- /dev/null +++ b/src/util/r-value.ts @@ -0,0 +1,74 @@ +import type { RLogicalValue } from '../r-bridge/lang-4.x/ast/model/nodes/r-logical'; +import { RFalse, RTrue, type RNumberValue, type RStringValue } from '../r-bridge/lang-4.x/convert-values'; + +function isRValue(value: unknown): value is RStringValue | RNumberValue | RLogicalValue | string | number { + return isRStringValue(value) || isRNumberValue(value) || isRLogicalValue(value) || typeof value === 'string' || 
typeof value === 'number'; +} + +export function isRStringValue(value: unknown): value is RStringValue { + return typeof value === 'object' && value !== null && 'str' in value && typeof value.str === 'string'; +} + +export function isRNumberValue(value: unknown): value is RNumberValue { + return typeof value === 'object' && value !== null && 'num' in value && typeof value.num === 'number'; +} + +export function isRLogicalValue(value: unknown): value is RLogicalValue { + return typeof value === 'boolean'; +} + +export function unwrapRValue(value: RStringValue | string): string; +export function unwrapRValue(value: RNumberValue | number): number; +export function unwrapRValue(value: RLogicalValue): boolean; +export function unwrapRValue(value: RStringValue | RNumberValue | RLogicalValue | string | number): string | number | boolean; +export function unwrapRValue(value: unknown): string | number | boolean | undefined; +export function unwrapRValue(value: RStringValue | RNumberValue | RLogicalValue | string | number | unknown): string | number | boolean | undefined { + if(typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') { + return value; + } else if(isRStringValue(value)) { + return value.str; + } else if(isRNumberValue(value)) { + return value.num; + } else { + return undefined; + } +} + +export function unwrapRVector(value: RStringValue[] | string[]): string[]; +export function unwrapRVector(value: RNumberValue[] | number[]): number[]; +export function unwrapRVector(value: RLogicalValue[]): boolean[]; +export function unwrapRVector(value: RStringValue[] | RNumberValue[] | RLogicalValue[] | string[] | number[]): string[] | number[] | boolean[]; +export function unwrapRVector(value: unknown): string[] | number[] | boolean[] | (string | number | boolean)[] | undefined; +export function unwrapRVector(value: RStringValue[] | RNumberValue[] | RLogicalValue[] | string[] | number[] | unknown): string[] | number[] | boolean[] | (string | 
number | boolean)[] | undefined { + if(!Array.isArray(value)) { + return undefined; + } else if(value.every(entry => typeof entry === 'string') || value.every(entry => typeof entry === 'number') || value.every(entry => typeof entry === 'boolean')) { + return value; + } else if(value.every(isRStringValue)) { + return value.map(entry => unwrapRValue(entry)); + } else if(value.every(isRNumberValue)) { + return value.map(entry => unwrapRValue(entry)); + } else if(value.every(isRValue)) { + return value.map(entry => unwrapRValue(entry)); + } else { + return undefined; + } +} + +export function unwrapRValueToString(value: RStringValue | RNumberValue | RLogicalValue | string | number): string; +export function unwrapRValueToString(value: unknown): string | undefined; +export function unwrapRValueToString(value: RStringValue | RNumberValue | RLogicalValue | string | number | unknown): string | undefined { + if(typeof value === 'string') { + return value; + } else if(typeof value === 'number') { + return value.toString(); + } else if(typeof value === 'boolean') { + return value ? 
RTrue : RFalse; + } else if(isRStringValue(value)) { + return value.str; + } else if(isRNumberValue(value)) { + return value.num.toString(); + } else { + return undefined; + } +} diff --git a/test/functionality/abstract-interpretation/data-frame/data-frame.ts b/test/functionality/abstract-interpretation/data-frame/data-frame.ts index 6c100415d2..9bbe0188b4 100644 --- a/test/functionality/abstract-interpretation/data-frame/data-frame.ts +++ b/test/functionality/abstract-interpretation/data-frame/data-frame.ts @@ -1,19 +1,23 @@ import { assert, beforeAll, test } from 'vitest'; +import type { AbstractInterpretationInfo } from '../../../../src/abstract-interpretation/data-frame/absint-info'; +import { performDataFrameAbsint } from '../../../../src/abstract-interpretation/data-frame/abstract-interpretation'; import type { DataFrameDomain } from '../../../../src/abstract-interpretation/data-frame/domain'; import { DataFrameTop, leqColNames, leqInterval } from '../../../../src/abstract-interpretation/data-frame/domain'; import { PipelineExecutor } from '../../../../src/core/pipeline-executor'; import type { TREE_SITTER_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines'; import { createDataflowPipeline, DEFAULT_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines'; import type { PipelineOutput } from '../../../../src/core/steps/pipeline/pipeline'; +import type { RNode } from '../../../../src/r-bridge/lang-4.x/ast/model/model'; import type { RSymbol } from '../../../../src/r-bridge/lang-4.x/ast/model/nodes/r-symbol'; import type { ParentInformation } from '../../../../src/r-bridge/lang-4.x/ast/model/processing/decorate'; import { RType } from '../../../../src/r-bridge/lang-4.x/ast/model/type'; import type { KnownParser } from '../../../../src/r-bridge/parser'; import { requestFromInput } from '../../../../src/r-bridge/retriever'; import type { RShell } from '../../../../src/r-bridge/shell'; -import type { 
SingleSlicingCriterion, SlicingCriteria } from '../../../../src/slicing/criterion/parse'; +import type { SingleSlicingCriterion } from '../../../../src/slicing/criterion/parse'; import { slicingCriterionToId } from '../../../../src/slicing/criterion/parse'; import { assertUnreachable, guard, isNotUndefined } from '../../../../src/util/assert'; +import { extractCFG } from '../../../../src/util/cfg/cfg'; import { getRangeEnd } from '../../../../src/util/range'; import { decorateLabelContext, type TestLabel } from '../../_helper/label'; @@ -41,7 +45,8 @@ interface CriterionTestEntry { criterion: SingleSlicingCriterion, value: DataFrameDomain, node: RSymbol, - lineNumber: number + lineNumber: number, + options: DataFrameTestOptions } export function assertDataFrameDomain( @@ -69,13 +74,10 @@ export function assertDataFrameDomain( export function testDataFrameDomainAgainstReal( shell: RShell, code: string, - criteria: SlicingCriteria, - /** Whether the inferred properties should match exacly the actual properties or can be an over-approximation (defaults to exact for all properties) */ - options?: Partial, + /** The options describe whether the inferred properties should match exacly the actual properties or can be an over-approximation (defaults to exact for all properties) */ + criteria: (SingleSlicingCriterion | [SingleSlicingCriterion, Partial])[], name: string | TestLabel = code ): void { - const effectiveOptions = { ...DataFrameTestExact, ...options }; - test(decorateLabelContext(name, ['absint']), async()=> { const result = await new PipelineExecutor(DEFAULT_DATAFLOW_PIPELINE, { parser: shell, @@ -84,14 +86,16 @@ export function testDataFrameDomainAgainstReal( const testEntries: CriterionTestEntry[] = []; - for(const criterion of criteria) { + for(const entry of criteria) { + const criterion = Array.isArray(entry) ? entry[0] : entry; + const options = { ...DataFrameTestExact, ...(Array.isArray(entry) ? 
entry[1] : {}) }; const [value, node] = getInferredDomainForCriterion(result, criterion); const lineNumber = getRangeEnd(node.info.fullRange ?? node.location)?.[0]; if(lineNumber === undefined) { throw new Error(`cannot resolve line of criterion ${criterion}`); } - testEntries.push({ criterion, value, node, lineNumber }); + testEntries.push({ criterion, value, node, lineNumber, options }); } testEntries.sort((a, b) => b.lineNumber - a.lineNumber); const lines = code.split('\n'); @@ -102,21 +106,21 @@ export function testDataFrameDomainAgainstReal( createCodeForOutput('cols', criterion, node.content), createCodeForOutput('rows', criterion, node.content) ]; - lines.splice(lineNumber + 1, 0, ...outputCode); + lines.splice(lineNumber, 0, ...outputCode); } const instrumentedCode = lines.join('\n'); shell.clearEnvironment(); const output = await shell.sendCommandWithOutput(instrumentedCode); - for(const { criterion, value } of testEntries) { + for(const { criterion, value, options } of testEntries) { const colnames = getRealDomainFromOutput('colnames', criterion, output); const cols = getRealDomainFromOutput('cols', criterion, output); const rows = getRealDomainFromOutput('rows', criterion, output); - assertDomainMatching('colnames', value.colnames, colnames, leqColNames, effectiveOptions.colnames); - assertDomainMatching('cols', value.cols, cols, leqInterval, effectiveOptions.cols); - assertDomainMatching('rows', value.rows, rows, leqInterval, effectiveOptions.rows); + assertDomainMatching('colnames', value.colnames, colnames, leqColNames, options.colnames); + assertDomainMatching('cols', value.cols, cols, leqInterval, options.cols); + assertDomainMatching('rows', value.rows, rows, leqInterval, options.rows); } }); } @@ -161,12 +165,14 @@ function getInferredDomainForCriterion( ): [DataFrameDomain, RSymbol] { const idMap = result.dataflow.graph.idMap ?? 
result.normalize.idMap; const nodeId = slicingCriterionToId(criterion, idMap); - const node = idMap.get(nodeId); + const node: RNode | undefined = idMap.get(nodeId); if(node === undefined || node.type !== RType.Symbol) { throw new Error(`slicing criterion ${criterion} does not refer to a R symbol`); } - const value = DataFrameTop; + const cfg = extractCFG(result.normalize, result.dataflow.graph); + performDataFrameAbsint(cfg, result.dataflow.graph); + const value = node.info.dataFrame?.domain?.get(node.info.id) ?? DataFrameTop; return [value, node]; } diff --git a/test/functionality/abstract-interpretation/data-frame/domain.test.ts b/test/functionality/abstract-interpretation/data-frame/domain.test.ts index 569c3c903d..a69f23c25a 100644 --- a/test/functionality/abstract-interpretation/data-frame/domain.test.ts +++ b/test/functionality/abstract-interpretation/data-frame/domain.test.ts @@ -1,11 +1,14 @@ import { assert, describe, test } from 'vitest'; -import type { ColNamesDomain, IntervalDomain } from '../../../../src/abstract-interpretation/data-frame/domain'; -import { ColNamesBottom, ColNamesTop, IntervalBottom, IntervalTop, joinColNames, joinInterval, leqColNames, leqInterval, meetColNames, meetInterval, subtractColNames } from '../../../../src/abstract-interpretation/data-frame/domain'; +import type { ColNamesDomain, DataFrameDomain, DataFrameStateDomain, IntervalDomain } from '../../../../src/abstract-interpretation/data-frame/domain'; +import { ColNamesBottom, ColNamesTop, DataFrameBottom, DataFrameTop, equalColNames, equalDataFrameDomain, equalDataFrameState, equalInterval, IntervalBottom, IntervalTop, joinColNames, joinDataFrames, joinDataFrameStates, joinInterval, leqColNames, leqInterval, meetColNames, meetDataFrames, meetDataFrameStates, meetInterval, subtractColNames } from '../../../../src/abstract-interpretation/data-frame/domain'; describe('Data Frame Domain', () => { describe('Column Names Domain', () => { const toSet = (value: ColNamesDomain) => 
value === ColNamesTop ? value : new Set(value); - const check = (X1: ColNamesDomain, X2: ColNamesDomain, leq: boolean, join: ColNamesDomain, meet: ColNamesDomain, difference: ColNamesDomain) => { + const check = (X1: ColNamesDomain, X2: ColNamesDomain, equal: boolean, leq: boolean, join: ColNamesDomain, meet: ColNamesDomain, difference: ColNamesDomain) => { + test(`${JSON.stringify(X1)} = ${JSON.stringify(X2)}`, () => { + assert.strictEqual(equalColNames(X1, X2), equal); + }); test(`${JSON.stringify(X1)} ⊑ ${JSON.stringify(X2)}`, () => { assert.strictEqual(leqColNames(X1, X2), leq); }); @@ -19,23 +22,26 @@ describe('Data Frame Domain', () => { assert.deepStrictEqual(toSet(subtractColNames(X1, X2)), toSet(difference)); }); }; - check(ColNamesBottom, ColNamesBottom, true, ColNamesBottom, ColNamesBottom, ColNamesBottom); - check(ColNamesTop, ColNamesTop, true, ColNamesTop, ColNamesTop, ColNamesTop); - check(ColNamesBottom, ColNamesTop, true, ColNamesTop, ColNamesBottom, ColNamesBottom); - check(ColNamesTop, ColNamesBottom, false, ColNamesTop, ColNamesBottom, ColNamesTop); - check(ColNamesBottom, ['id', 'age'], true, ['id', 'age'], ColNamesBottom, ColNamesBottom); - check(['id', 'age'], ColNamesBottom, false, ['id', 'age'], ColNamesBottom, ['id', 'age']); - check(['id', 'age'], ['id', 'age'], true, ['id', 'age'], ['id', 'age'], ColNamesBottom); - check(['id', 'age'], ['id', 'age', 'score'], true, ['id', 'age', 'score'], ['id', 'age'], ColNamesBottom); - check(['id', 'age', 'score'], ['id', 'age'], false, ['id', 'age', 'score'], ['id', 'age'], ['score']); - check(['id', 'age', 'score'], ['id', 'category'], false, ['id', 'age', 'score', 'category'], ['id'], ['age', 'score']); - check(['id', 'category'], ['id', 'age', 'score'], false, ['id', 'age', 'score', 'category'], ['id'], ['category']); - check(['id', 'age'], ColNamesTop, true, ColNamesTop, ['id', 'age'], ['id', 'age']); - check(ColNamesTop, ['id', 'age'], false, ColNamesTop, ['id', 'age'], ColNamesTop); + 
check(ColNamesBottom, ColNamesBottom, true, true, ColNamesBottom, ColNamesBottom, ColNamesBottom); + check(ColNamesTop, ColNamesTop, true, true, ColNamesTop, ColNamesTop, ColNamesTop); + check(ColNamesBottom, ColNamesTop, false, true, ColNamesTop, ColNamesBottom, ColNamesBottom); + check(ColNamesTop, ColNamesBottom, false, false, ColNamesTop, ColNamesBottom, ColNamesTop); + check(ColNamesBottom, ['id', 'age'], false, true, ['id', 'age'], ColNamesBottom, ColNamesBottom); + check(['id', 'age'], ColNamesBottom, false, false, ['id', 'age'], ColNamesBottom, ['id', 'age']); + check(['id', 'age'], ['age', 'id'], true, true, ['id', 'age'], ['id', 'age'], ColNamesBottom); + check(['id', 'age'], ['id', 'age', 'score'], false, true, ['id', 'age', 'score'], ['id', 'age'], ColNamesBottom); + check(['id', 'age', 'score'], ['id', 'age'], false, false, ['id', 'age', 'score'], ['id', 'age'], ['score']); + check(['id', 'age', 'score'], ['id', 'category'], false, false, ['id', 'age', 'score', 'category'], ['id'], ['age', 'score']); + check(['id', 'category'], ['id', 'age', 'score'], false, false, ['id', 'age', 'score', 'category'], ['id'], ['category']); + check(['id', 'age'], ColNamesTop, false, true, ColNamesTop, ['id', 'age'], ['id', 'age']); + check(ColNamesTop, ['id', 'age'], false, false, ColNamesTop, ['id', 'age'], ColNamesTop); }); describe('Interval Domain', () => { - const check = (X1: IntervalDomain, X2: IntervalDomain, leq: boolean, join: IntervalDomain, meet: IntervalDomain) => { + const check = (X1: IntervalDomain, X2: IntervalDomain, equal: boolean, leq: boolean, join: IntervalDomain, meet: IntervalDomain) => { + test(`${JSON.stringify(X1)} = ${JSON.stringify(X2)}`, () => { + assert.strictEqual(equalInterval(X1, X2), equal); + }); test(`${JSON.stringify(X1)} ⊑ ${JSON.stringify(X2)}`, () => { assert.strictEqual(leqInterval(X1, X2), leq); }); @@ -46,35 +52,122 @@ describe('Data Frame Domain', () => { assert.deepStrictEqual(meetInterval(X1, X2), meet); }); }; - 
check(IntervalBottom, IntervalBottom, true, IntervalBottom, IntervalBottom); - check(IntervalTop, IntervalTop, true, IntervalTop, IntervalTop); - check(IntervalBottom, IntervalTop, true, IntervalTop, IntervalBottom); - check(IntervalTop, IntervalBottom, false, IntervalTop, IntervalBottom); - check(IntervalBottom, [2, 2], true, [2, 2], IntervalBottom); - check([2, 2], IntervalBottom, false, [2, 2], IntervalBottom); - check(IntervalBottom, [2, 8], true, [2, 8], IntervalBottom); - check([2, 8], IntervalBottom, false, [2, 8], IntervalBottom); - check([2, 8], [0, 4], false, [0, 8], [2, 4]); - check([0, 4], [2, 8], false, [0, 8], [2, 4]); - check([2, 8], [4, 12], false, [2, 12], [4, 8]); - check([4, 12], [2, 8], false, [2, 12], [4, 8]); - check([2, 8], [8, Infinity], false, [2, Infinity], [8, 8]); - check([8, Infinity], [2, 8], false, [2, Infinity], [8, 8]); - check([2, 8], [2, 4], false, [2, 8], [2, 4]); - check([2, 4], [2, 8], true, [2, 8], [2, 4]); - check([2, 8], [2, 2], false, [2, 8], [2, 2]); - check([2, 2], [2, 8], true, [2, 8], [2, 2]); - check([2, 8], [0, 0], false, [0, 8], IntervalBottom); - check([0, 0], [2, 8], false, [0, 8], IntervalBottom); - check([2, 8], [10, 12], false, [2, 12], IntervalBottom); - check([10, 12], [2, 8], false, [2, 12], IntervalBottom); - check([0, 0], [12, Infinity], false, IntervalTop, IntervalBottom); - check([12, Infinity], [0, 0], false, IntervalTop, IntervalBottom); - check([4, Infinity], [12, Infinity], false, [4, Infinity], [12, Infinity]); - check([12, Infinity], [4, Infinity], true, [4, Infinity], [12, Infinity]); - check([2, 8], IntervalTop, true, IntervalTop, [2, 8]); - check(IntervalTop, [2, 8], false, IntervalTop, [2, 8]); - check([12, Infinity], IntervalTop, true, IntervalTop, [12, Infinity]); - check(IntervalTop, [12, Infinity], false, IntervalTop, [12, Infinity]); + check(IntervalBottom, IntervalBottom, true, true, IntervalBottom, IntervalBottom); + check(IntervalTop, IntervalTop, true, true, IntervalTop, IntervalTop); + 
check(IntervalBottom, IntervalTop, false, true, IntervalTop, IntervalBottom); + check(IntervalTop, IntervalBottom, false, false, IntervalTop, IntervalBottom); + check(IntervalBottom, [2, 2], false, true, [2, 2], IntervalBottom); + check([2, 2], IntervalBottom, false, false, [2, 2], IntervalBottom); + check(IntervalBottom, [2, 8], false, true, [2, 8], IntervalBottom); + check([2, 8], IntervalBottom, false, false, [2, 8], IntervalBottom); + check([2, 8], [2, 8], true, true, [2, 8], [2, 8]); + check([2, 8], [0, 4], false, false, [0, 8], [2, 4]); + check([0, 4], [2, 8], false, false, [0, 8], [2, 4]); + check([2, 8], [4, 12], false, false, [2, 12], [4, 8]); + check([4, 12], [2, 8], false, false, [2, 12], [4, 8]); + check([2, 8], [8, Infinity], false, false, [2, Infinity], [8, 8]); + check([8, Infinity], [2, 8], false, false, [2, Infinity], [8, 8]); + check([2, 8], [2, 4], false, false, [2, 8], [2, 4]); + check([2, 4], [2, 8], false, true, [2, 8], [2, 4]); + check([2, 8], [2, 2], false, false, [2, 8], [2, 2]); + check([2, 2], [2, 8], false, true, [2, 8], [2, 2]); + check([2, 8], [0, 0], false, false, [0, 8], IntervalBottom); + check([0, 0], [2, 8], false, false, [0, 8], IntervalBottom); + check([2, 8], [10, 12], false, false, [2, 12], IntervalBottom); + check([10, 12], [2, 8], false, false, [2, 12], IntervalBottom); + check([0, 0], [12, Infinity], false, false, IntervalTop, IntervalBottom); + check([12, Infinity], [0, 0], false, false, IntervalTop, IntervalBottom); + check([4, Infinity], [12, Infinity], false, false, [4, Infinity], [12, Infinity]); + check([12, Infinity], [4, Infinity], false, true, [4, Infinity], [12, Infinity]); + check([2, 8], IntervalTop, false, true, IntervalTop, [2, 8]); + check(IntervalTop, [2, 8], false, false, IntervalTop, [2, 8]); + check([12, Infinity], IntervalTop, false, true, IntervalTop, [12, Infinity]); + check(IntervalTop, [12, Infinity], false, false, IntervalTop, [12, Infinity]); + }); + + describe('Data Frame Domain', () => { + const 
check = (X1: DataFrameDomain, X2: DataFrameDomain, equal: boolean, join: DataFrameDomain, meet: DataFrameDomain) => { + test(`${JSON.stringify(X1)} = ${JSON.stringify(X2)}`, () => { + assert.strictEqual(equalDataFrameDomain(X1, X2), equal); + }); + test(`${JSON.stringify(X1)} ⊔ ${JSON.stringify(X2)}`, () => { + const result = joinDataFrames(X1, X2); + assert.isTrue(equalDataFrameDomain(result, join), `expected domain ${JSON.stringify(result)} to equal domain ${JSON.stringify(join)}`); + }); + test(`${JSON.stringify(X1)} ⊓ ${JSON.stringify(X2)}`, () => { + const result = meetDataFrames(X1, X2); + assert.isTrue(equalDataFrameDomain(result, meet), `expected domain ${JSON.stringify(result)} to equal domain ${JSON.stringify(meet)}`); + }); + }; + const join = (X1: DataFrameDomain, X2: DataFrameDomain): DataFrameDomain => { + return { colnames: joinColNames(X1.colnames, X2.colnames), cols: joinInterval(X1.cols, X2.cols), rows: joinInterval(X1.rows, X2.rows) }; + }; + const meet = (X1: DataFrameDomain, X2: DataFrameDomain): DataFrameDomain => { + return { colnames: meetColNames(X1.colnames, X2.colnames), cols: meetInterval(X1.cols, X2.cols), rows: meetInterval(X1.rows, X2.rows) }; + }; + const domain1: DataFrameDomain = { colnames: ['id', 'name', 'age'], cols: [3, 5], rows: [5, 5] }; + const domain2: DataFrameDomain = { colnames: ['id', 'category'], cols: [2, 2], rows: [0, 6] }; + + check(DataFrameBottom, DataFrameBottom, true, DataFrameBottom, DataFrameBottom); + check(DataFrameTop, DataFrameTop, true, DataFrameTop, DataFrameTop); + check(DataFrameBottom, DataFrameTop, false, DataFrameTop, DataFrameBottom); + check(DataFrameTop, DataFrameBottom, false, DataFrameTop, DataFrameBottom); + check(DataFrameBottom, domain1, false, domain1, DataFrameBottom); + check(domain1, DataFrameBottom, false, domain1, DataFrameBottom); + check(domain1, domain1, true, domain1, domain1); + check(domain1, { ...domain1, colnames: ColNamesTop }, false, { ...domain1, colnames: ColNamesTop }, 
domain1); + check({ ...domain1, colnames: ColNamesTop }, domain1, false, { ...domain1, colnames: ColNamesTop }, domain1); + check(domain1, { ...domain1, cols: IntervalTop }, false, { ...domain1, cols: IntervalTop }, domain1); + check({ ...domain1, cols: IntervalTop }, domain1, false, { ...domain1, cols: IntervalTop }, domain1); + check(domain1, { ...domain1, rows: IntervalTop }, false, { ...domain1, rows: IntervalTop }, domain1); + check({ ...domain1, rows: IntervalTop }, domain1, false, { ...domain1, rows: IntervalTop }, domain1); + check(domain1, { ...domain1, colnames: ColNamesBottom }, false, domain1, { ...domain1, colnames: ColNamesBottom }); + check({ ...domain1, colnames: ColNamesBottom }, domain1, false, domain1, { ...domain1, colnames: ColNamesBottom }); + check(domain1, { ...domain1, cols: IntervalBottom }, false, domain1, { ...domain1, cols: IntervalBottom }); + check({ ...domain1, cols: IntervalBottom }, domain1, false, domain1, { ...domain1, cols: IntervalBottom }); + check(domain1, { ...domain1, rows: IntervalBottom }, false, domain1, { ...domain1, rows: IntervalBottom }); + check({ ...domain1, rows: IntervalBottom }, domain1, false, domain1, { ...domain1, rows: IntervalBottom }); + check(domain1, domain2, false, join(domain1, domain2), meet(domain1, domain2)); + check(domain2, domain1, false, join(domain2, domain1), meet(domain2, domain1)); + check(DataFrameTop, domain1, false, DataFrameTop, domain1); + check(domain1, DataFrameTop, false, DataFrameTop, domain1); + }); + + describe('Data Frame State Domain', () => { + const toString = (state: DataFrameStateDomain) => JSON.stringify(Object.fromEntries(state)); + const check = (R1: DataFrameStateDomain, R2: DataFrameStateDomain, equal: boolean, join: DataFrameStateDomain, meet: DataFrameStateDomain) => { + test(`${toString(R1)} = ${toString(R2)}`, () => { + assert.strictEqual(equalDataFrameState(R1, R2), equal); + }); + test(`${toString(R1)} ⊔ ${toString(R2)}`, () => { + const result = 
joinDataFrameStates(R1, R2); + assert.isTrue(equalDataFrameState(result, join), `expected state ${toString(result)} to equal state ${toString(join)}`); + }); + test(`${toString(R1)} ⊓ ${toString(R2)}`, () => { + const result = meetDataFrameStates(R1, R2); + assert.isTrue(equalDataFrameState(result, meet), `expected state ${toString(result)} to equal state ${toString(meet)}`); + }); + }; + const domain1: DataFrameDomain = { colnames: ['id', 'name', 'age'], cols: [3, 5], rows: [5, 5] }; + const domain2: DataFrameDomain = { colnames: ['id', 'category'], cols: [2, 2], rows: [0, 6] }; + + check(new Map([[0, DataFrameBottom]]), new Map([[0, DataFrameBottom]]), true, new Map([[0, DataFrameBottom]]), new Map([[0, DataFrameBottom]])); + check(new Map([[0, DataFrameTop]]), new Map([[0, DataFrameTop]]), true, new Map([[0, DataFrameTop]]), new Map([[0, DataFrameTop]])); + check(new Map([[0, DataFrameBottom]]), new Map([[0, DataFrameTop]]), false, new Map([[0, DataFrameTop]]), new Map([[0, DataFrameBottom]])); + check(new Map([[0, DataFrameTop]]), new Map([[0, DataFrameBottom]]), false, new Map([[0, DataFrameTop]]), new Map([[0, DataFrameBottom]])); + check(new Map([[0, DataFrameBottom]]), new Map([[0, domain1]]), false, new Map([[0, domain1]]), new Map([[0, DataFrameBottom]])); + check(new Map([[0, domain1]]), new Map([[0, DataFrameBottom]]), false, new Map([[0, domain1]]), new Map([[0, DataFrameBottom]])); + check(new Map([[0, domain1]]), new Map([[0, domain1]]), true, new Map([[0, domain1]]), new Map([[0, domain1]])); + check(new Map([[0, domain1]]), new Map([[0, domain2]]), false, new Map([[0, joinDataFrames(domain1, domain2)]]), new Map([[0, meetDataFrames(domain1, domain2)]])); + check(new Map([[0, domain2]]), new Map([[0, domain1]]), false, new Map([[0, joinDataFrames(domain2, domain1)]]), new Map([[0, meetDataFrames(domain2, domain1)]])); + check(new Map([[0, domain1], [1, domain2]]), new Map([[0, domain1], [1, domain2]]), true, new Map([[0, domain1], [1, domain2]]), 
new Map([[0, domain1], [1, domain2]])); + check(new Map([[1, DataFrameTop]]), new Map([[0, domain1], [1, domain2]]), false, new Map([[0, domain1], [1, DataFrameTop]]), new Map([[0, domain1], [1, domain2]])); + check(new Map([[0, domain1], [1, domain2]]), new Map([[1, DataFrameTop]]), false, new Map([[0, domain1], [1, DataFrameTop]]), new Map([[0, domain1], [1, domain2]])); + check(new Map([[0, domain1], [1, domain2]]), new Map([[0, DataFrameTop], [1, DataFrameBottom]]), false, new Map([[0, DataFrameTop], [1, domain2]]), new Map([[0, domain1], [1, DataFrameBottom]])); + check(new Map([[0, DataFrameTop], [1, DataFrameBottom]]), new Map([[0, domain1], [1, domain2]]), false, new Map([[0, DataFrameTop], [1, domain2]]), new Map([[0, domain1], [1, DataFrameBottom]])); + check(new Map([[0, domain1], [2, DataFrameBottom]]), new Map([[1, DataFrameTop]]), false, new Map([[0, domain1], [1, DataFrameTop], [2, DataFrameBottom]]), new Map([[0, domain1], [1, DataFrameTop], [2, DataFrameBottom]])); + check(new Map([[1, DataFrameTop]]), new Map([[0, domain1], [2, DataFrameBottom]]), false, new Map([[0, domain1], [1, DataFrameTop], [2, DataFrameBottom]]), new Map([[0, domain1], [1, DataFrameTop], [2, DataFrameBottom]])); + check(new Map([[0, DataFrameTop]]), new Map([[0, domain1]]), false, new Map([[0, DataFrameTop]]), new Map([[0, domain1]])); + check(new Map([[0, domain1]]), new Map([[0, DataFrameTop]]), false, new Map([[0, DataFrameTop]]), new Map([[0, domain1]])); }); }); diff --git a/test/functionality/abstract-interpretation/data-frame/inference.test.ts b/test/functionality/abstract-interpretation/data-frame/inference.test.ts index 4c0dca0330..7fd283aaf0 100644 --- a/test/functionality/abstract-interpretation/data-frame/inference.test.ts +++ b/test/functionality/abstract-interpretation/data-frame/inference.test.ts @@ -1,133 +1,263 @@ import { describe } from 'vitest'; +import type { DataFrameDomain } from '../../../../src/abstract-interpretation/data-frame/domain'; +import { 
ColNamesTop, DataFrameTop, IntervalTop } from '../../../../src/abstract-interpretation/data-frame/domain'; +import type { SingleSlicingCriterion } from '../../../../src/slicing/criterion/parse'; import { withShell } from '../../_helper/shell'; -import { ColNamesTop, DataFrameTop } from '../../../../src/abstract-interpretation/data-frame/domain'; -import { testDataFrameDomainAgainstReal, assertDataFrameDomain, DomainMatchingType, DataFrameTestOverapproximation } from './data-frame'; +import type { DataFrameTestOptions } from './data-frame'; +import { assertDataFrameDomain, DataFrameTestOverapproximation, DomainMatchingType, testDataFrameDomainAgainstReal } from './data-frame'; -describe.sequential('Data Frame Abstract Interpretation', { skip: true }, withShell(shell => { - assertDataFrameDomain( - shell, - 'df <- data.frame(id = 1:5, age = c(25, 32, 35, 40, 45), score = c(90, 85, 88, 92, 95), row.names = NULL)', - [['1@df', { colnames: ['id', 'age', 'score'], cols: [3, 3], rows: [5, 5] }]] - ); +describe.sequential('Data Frame Abstract Interpretation', withShell(shell => { + function testDataFrameDomain( + code: string, + criteria: ([SingleSlicingCriterion, DataFrameDomain] | [SingleSlicingCriterion, DataFrameDomain, Partial])[] + ) { + assertDataFrameDomain(shell, code, criteria.map(entry => [entry[0], entry[1]])); + testDataFrameDomainAgainstReal(shell, code, criteria.map(entry => entry.length === 3 ? 
[entry[0], entry[2]] : entry[0])); + } - testDataFrameDomainAgainstReal( - shell, + testDataFrameDomain( 'df <- data.frame(id = 1:5, age = c(25, 32, 35, 40, 45), score = c(90, 85, 88, 92, 95), row.names = NULL)', - ['1@df'] + [['1@df', { colnames: ['id', 'age', 'score'], cols: [3, 3], rows: [5, 5] }]] ); - assertDataFrameDomain( - shell, + testDataFrameDomain( 'df <- data.frame(id = c(1, 2, 3, 5, 6, 7), category = c("A", "B", "A", "A", "B", "B"))', [['1@df', { colnames: ['id', 'category'], cols: [2, 2], rows: [6, 6] }]] ); - testDataFrameDomainAgainstReal( - shell, - 'df <- data.frame(id = c(1, 2, 3, 5, 6, 7), category = c("A", "B", "A", "A", "B", "B"))', - ['1@df'] + testDataFrameDomain( + 'df <- data.frame(c(1, 2, 3:5, c(6, 7, c(8, 9))), c("a", "b", "c"))', + [['1@df', { colnames: ColNamesTop, cols: [2, 2], rows: [9, 9] }, { colnames: DomainMatchingType.Overapproximation }]] ); - assertDataFrameDomain( - shell, - 'df <- data.frame(c(1, 2, 3:5, c(6, 7, c(8, 9))), c("a", "b", "c"))', - [['1@df', { colnames: ColNamesTop, cols: [2, 2], rows: [9, 9] }]] + testDataFrameDomain( + 'df <- data.frame()', + [['1@df', { colnames: [], cols: [0, 0], rows: [0, 0] }]] ); - testDataFrameDomainAgainstReal( - shell, - 'df <- data.frame(c(1, 2, 3:5, c(6, 7, c(8, 9))), c("a", "b", "c"))', - ['1@df'], - { colnames: DomainMatchingType.Overapproximation } + testDataFrameDomain( + ` +df1 <- data.frame(id = 1:5) +df2 <- df1 + `.trim(), + [ + ['1@df1', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['2@df1', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['2@df2', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }] + ] ); - assertDataFrameDomain( - shell, - 'df <- data.frame()', - [['1@df', { colnames: [], cols: [0, 0], rows: [0, 0] }]] + testDataFrameDomain( + 'df <- read.csv(text = "id,age\\n1,30\\n2,50\\n3,45")', + [['1@df', DataFrameTop, DataFrameTestOverapproximation]] ); - testDataFrameDomainAgainstReal( - shell, - 'df <- data.frame()', - ['1@df'] + testDataFrameDomain( + 
'df <- eval(parse(text = "data.frame()"))', + [['1@df', DataFrameTop, DataFrameTestOverapproximation]] ); - assertDataFrameDomain( - shell, - 'df1 <- data.frame(id = 1:5); df2 <- df1', + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, type = c("A", "B", "C")) +df <- data.frame() +print(df) + `.trim(), [ - ['1@df1', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], - ['1@df2', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }] + ['1@df', { colnames: ['id', 'type'], cols: [2, 2], rows: [3, 3] }], + ['2@df', { colnames: [], cols: [0, 0], rows: [0, 0] }], + ['3@df', { colnames: [], cols: [0, 0], rows: [0, 0] }] ] ); - testDataFrameDomainAgainstReal( - shell, - 'df1 <- data.frame(id = 1:5); df2 <- df1', - ['1@df1', '1@df2'] + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, type = c("A", "B", "C")) +print(df <- data.frame()) +print(df) + `.trim(), + [ + ['1@df', { colnames: ['id', 'type'], cols: [2, 2], rows: [3, 3] }], + ['2@df', { colnames: [], cols: [0, 0], rows: [0, 0] }], + ['3@df', { colnames: [], cols: [0, 0], rows: [0, 0] }] + ] + ); + + testDataFrameDomain( + 'df <- 1:3 |> data.frame(type = c("A", "B", "C"))', + [['1@df', { colnames: ColNamesTop, cols: [2, 2], rows: [3, 3] }, { colnames: DomainMatchingType.Overapproximation }]] ); assertDataFrameDomain( shell, - 'df <- read.csv("test.csv")', + 'df <- if (runif(1) >= 0.5) data.frame(id = 1:5)', [['1@df', DataFrameTop]] ); - testDataFrameDomainAgainstReal( - shell, - 'df <- read.csv(text = "id,age\\n1,30\\n2,50\\n3,45")', - ['1@df'], - DataFrameTestOverapproximation + testDataFrameDomain( + 'df <- if (runif(1) >= 0.5) data.frame(id = 1:5) else data.frame(id = 1:10, name = "A")', + [['1@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [5, 10] }, DataFrameTestOverapproximation]] ); - assertDataFrameDomain( - shell, - 'df <- eval(parse(text = "data.frame()"))', - [['1@df', DataFrameTop]] + testDataFrameDomain( + ` +if(runif(1) >= 0.5) { + df <- data.frame(id = 1:5) +} else { + df <- data.frame(id = 
1:10, name = "A") +} +print(df) + `.trim(), + [['6@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [5, 10] }, DataFrameTestOverapproximation]] ); - testDataFrameDomainAgainstReal( - shell, - 'df <- eval(parse(text = "data.frame()"))', - ['1@df'], - DataFrameTestOverapproximation + testDataFrameDomain( + ` +df <- data.frame(id = 1:5) +for (i in 1:5) { + df[["name"]] +} +df[10, ] +print(df) + `.trim(), + [['6@df', { colnames: ['id', 'name'], cols: [1, 1], rows: [5, 10] }, { colnames: DomainMatchingType.Overapproximation, rows: DomainMatchingType.Overapproximation }]] ); - assertDataFrameDomain( - shell, - 'df <- data.frame(id = 1:3, type = c("A", "B", "C"))\ndf <- data.frame()\nprint(df)', - [['3@df', { colnames: [], cols: [0, 0], rows: [0, 0] }]] + testDataFrameDomain( + ` +df <- data.frame(id = 1:5) +for (i in 1:5) { + break + df[["name"]] +} +df[10, ] +print(df) + `.trim(), + [['7@df', { colnames: ['id'], cols: [1, 1], rows: [5, 10] }, { colnames: DomainMatchingType.Overapproximation, rows: DomainMatchingType.Overapproximation }]] ); - testDataFrameDomainAgainstReal( - shell, - 'df <- data.frame(id = 1:3, type = c("A", "B", "C"))\ndf <- data.frame()\nprint(df)', - ['3@df'] + testDataFrameDomain( + ` +df <- data.frame(id = 1:5) +while (TRUE) { + df[["name"]] + break +} +df[10, ] +print(df) + `.trim(), + [['7@df', { colnames: ['id', 'name'], cols: [1, 1], rows: [5, 10] }, DataFrameTestOverapproximation]] ); assertDataFrameDomain( - shell, - 'df <- data.frame(id = 1:3, type = c("A", "B", "C"))\nprint(df <- data.frame())\nprint(df)', - [['3@df', { colnames: [], cols: [0, 0], rows: [0, 0] }]] + shell, ` +df <- data.frame(id = 1:5) +repeat { + df[["name"]] +} +df[10, ] +print(df) + `.trim(), + [['6@df', DataFrameTop]] ); - testDataFrameDomainAgainstReal( - shell, - 'df <- data.frame(id = 1:3, type = c("A", "B", "C"))\nprint(df <- data.frame())\nprint(df)', - ['3@df'] + testDataFrameDomain( + ` +df <- data.frame(id = 1:5) +df <- cbind(df, name = 6:10, label = c("A", 
"B", "C", "D", "E")) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['2@df', { colnames: ['id', 'name', 'label'], cols: [3, 3], rows: [5, 5] }] + ] ); - assertDataFrameDomain( - shell, - 'df <- 1:3 |> data.frame(type = c("A", "B", "C"))', - [['1@df', { colnames: ColNamesTop, cols: [2, 2], rows: [3, 3] }]] + testDataFrameDomain( + ` +df1 <- data.frame(id = 1:5) +df2 <- data.frame(name = 6:10) +df3 <- data.frame(label = c("A", "B", "C", "D", "E")) +df <- cbind(df1, df2, df3) + `.trim(), + [ + ['1@df1', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['2@df2', { colnames: ['name'], cols: [1, 1], rows: [5, 5] }], + ['3@df3', { colnames: ['label'], cols: [1, 1], rows: [5, 5] }], + ['4@df', { colnames: ['id', 'name', 'label'], cols: [3, 3], rows: [5, 5] }] + ] ); - testDataFrameDomainAgainstReal( - shell, - 'df <- 1:3 |> data.frame(type = c("A", "B", "C"))', - ['1@df'], - { colnames: DomainMatchingType.Overapproximation } + testDataFrameDomain( + ` +df1 <- data.frame(id = 1:5) +df2 <- data.frame(name = 6:10) +df <- cbind(df1, df2, label = c("A", "B", "C", "D", "E")) + `.trim(), + [ + ['1@df1', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['2@df2', { colnames: ['name'], cols: [1, 1], rows: [5, 5] }], + ['3@df', { colnames: ['id', 'name', 'label'], cols: [3, 3], rows: [5, 5] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:5) +df <- cbind(df, label = list(name = 6:10)) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['2@df', { colnames: ColNamesTop, cols: IntervalTop, rows: [5, 5] }, { colnames: DomainMatchingType.Overapproximation, cols: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1, name = "A", score = 20) +df <- rbind(df, c(2, "B", 30), c(4, "C", 25)) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name', 'score'], cols: [3, 3], rows: [1, 1] }], + ['2@df', { colnames: ['id', 'name', 'score'], cols: [3, 3], rows: [3, 3] }] 
+ ] + ); + + testDataFrameDomain( + ` +df1 <- data.frame(id = 1:3, name = c("A", "B", "C"), score = c(20, 30, 25)) +df2 <- data.frame(id = 4, name = "D", score = 20) +df3 <- data.frame(id = 5, name = "E", score = 40) +df <- rbind(df1, df2, df3) + `.trim(), + [ + ['1@df1', { colnames: ['id', 'name', 'score'], cols: [3, 3], rows: [3, 3] }], + ['2@df2', { colnames: ['id', 'name', 'score'], cols: [3, 3], rows: [1, 1] }], + ['3@df3', { colnames: ['id', 'name', 'score'], cols: [3, 3], rows: [1, 1] }], + ['4@df', { colnames: ['id', 'name', 'score'], cols: [3, 3], rows: [5, 5] }] + ] + ); + + testDataFrameDomain( + ` +df1 <- data.frame(id = 1:3, name = c("A", "B", "C"), score = c(20, 30, 25)) +df2 <- data.frame(id = 4, name = "D", score = 20) +df <- rbind(df1, df2, label = c(5, "E", 40)) + `.trim(), + [ + ['1@df1', { colnames: ['id', 'name', 'score'], cols: [3, 3], rows: [3, 3] }], + ['2@df2', { colnames: ['id', 'name', 'score'], cols: [3, 3], rows: [1, 1] }], + ['3@df', { colnames: ['id', 'name', 'score'], cols: [3, 3], rows: [5, 5] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:5) +df <- rbind(df, list(id = 6:10)) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['2@df', { colnames: ['id'], cols: [1, 1], rows: IntervalTop }, { rows: DomainMatchingType.Overapproximation }] + ] ); })); From d4f70c842df84cbec0dda544215d6bc2cc16459b Mon Sep 17 00:00:00 2001 From: Oliver Date: Thu, 1 May 2025 22:37:50 +0200 Subject: [PATCH 11/11] Add abstract semantics for remaining data frame operations (#1583) * feat: basic fold over CFG for forward traversal * feat: add simplified forward-connected control flow graph * feat-fix: correct label for CFG exit point edges * test: unit tests for simple control flow graph * feat: add CFG visitor for fixpoint iteration for data frames * feat-fix: add missing negation in has-changed check * feat: setup for data frame processors and semantics * feat: directly evaluate arguments when processing 
function * feat: add semantics mapper for column/row access and assignment * feat: implement data frame processors for basic nodes * feat: return resulting data frame domain * feat: support different types of inferred constraints * test: add tests for data frame state domain * feat: store abstract state in nodes * feat: support control flow constructs in abstract interpretation * test: add tests for control flow support * feat: support colnames assignment, cbind and rbind * feat: support data frame head and tail function * feat: support data frame subsetting via access operator * feat: support data frame subset, filter, and select * feat: support column and row assignment and magrittr pipe * feat: support mutate, left_join and group_by * test-fix: try to fix CI data frame test errors * test-fix: skip tests with library to test CI * test-fix: check evaluation of R code with library call in CI * test-fix: unload library after test * test-fix: restart R shell after library test * refactor: restructure semantics and processors * feat: support summarize and limit col names * feat: basic delayed widening for fixpoint iteration * test: add test for multiple else-if branches * feat: use dataflow graph to resolve origin processor --- .../data-frame/absint-info.ts | 10 +- .../data-frame/abstract-interpretation.ts | 38 +- .../data-frame/domain.ts | 77 +- .../data-frame/mappers/access-mapper.ts | 174 +++-- .../data-frame/mappers/assignment-mapper.ts | 178 +---- .../data-frame/mappers/function-mapper.ts | 644 +++++++++++++--- .../data-frame/mappers/replacement-mapper.ts | 167 +++++ .../data-frame/processor.ts | 113 +-- .../data-frame/resolve-args.ts | 17 +- .../data-frame/semantics-mapper.ts | 22 +- .../data-frame/semantics.ts | 159 +++- src/dataflow/environments/resolve-by-name.ts | 12 + src/util/cfg/cfg.ts | 4 - .../data-frame/data-frame.ts | 40 +- .../data-frame/inference.test.ts | 704 +++++++++++++++++- 15 files changed, 1842 insertions(+), 517 deletions(-) create mode 
100644 src/abstract-interpretation/data-frame/mappers/replacement-mapper.ts diff --git a/src/abstract-interpretation/data-frame/absint-info.ts b/src/abstract-interpretation/data-frame/absint-info.ts index 41f1173e4a..8048c23887 100644 --- a/src/abstract-interpretation/data-frame/absint-info.ts +++ b/src/abstract-interpretation/data-frame/absint-info.ts @@ -8,7 +8,7 @@ export interface DataFrameOperation { args: DataFrameOperationArgs } -type DataFrameOperations = { +export type DataFrameOperations = { [Name in DataFrameOperationName]: DataFrameOperation; }[DataFrameOperationName]; @@ -27,12 +27,8 @@ export interface DataFrameExpressionInfo { operations: DataFrameOperations[] } -export interface DataFrameOtherInfo { - type: 'other' -} - -export type DataFrameInfo = DataFrameAssignmentInfo | DataFrameExpressionInfo | DataFrameOtherInfo; +export type DataFrameInfo = DataFrameAssignmentInfo | DataFrameExpressionInfo; export interface AbstractInterpretationInfo { - dataFrame?: DataFrameInfo & DataFrameInfoBase + dataFrame?: (DataFrameInfo & DataFrameInfoBase) | DataFrameInfoBase } diff --git a/src/abstract-interpretation/data-frame/abstract-interpretation.ts b/src/abstract-interpretation/data-frame/abstract-interpretation.ts index 2c0e71100d..4cf5bf11e3 100644 --- a/src/abstract-interpretation/data-frame/abstract-interpretation.ts +++ b/src/abstract-interpretation/data-frame/abstract-interpretation.ts @@ -6,9 +6,11 @@ import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; import { CfgVertexType, ControlFlowGraph, type CfgVertex, type ControlFlowInformation } from '../../util/cfg/cfg'; import type { AbstractInterpretationInfo } from './absint-info'; import type { DataFrameDomain, DataFrameStateDomain } from './domain'; -import { equalDataFrameState, joinDataFrameStates } from './domain'; +import { equalDataFrameState, joinDataFrameStates, wideningDataFrameStates } from './domain'; import { processDataFrameExpression, processDataFrameLeaf } from './processor'; 
+const WideningThreshold = 4; + export function performDataFrameAbsint(cfinfo: ControlFlowInformation, dfg: DataflowGraph): DataFrameStateDomain { const visited: Map = new Map(); let finalDomain: DataFrameStateDomain = new Map(); @@ -23,27 +25,39 @@ export function performDataFrameAbsint(cfinfo: ControlFlowInformation, dfg: Data let newDomain = inputDomain; const entryNode: RNode | undefined = dfg.idMap?.get(vertex.id); + let node: RNode | undefined; if(entryNode !== undefined && isRSingleNode(entryNode)) { oldDomain = entryNode.info.dataFrame?.domain ?? oldDomain; newDomain = processDataFrameLeaf(entryNode, new Map(inputDomain), dfg); - } - if(vertex.type === CfgVertexType.EndMarker) { + node = entryNode; + } else if(vertex.type === CfgVertexType.EndMarker) { const exitId = getNodeIdForExitVertex(vertex.id); const exitNode: RNode | undefined = exitId !== undefined ? dfg.idMap?.get(exitId) : undefined; if(exitNode !== undefined && !isRSingleNode(exitNode)) { oldDomain = exitNode.info.dataFrame?.domain ?? oldDomain; newDomain = processDataFrameExpression(exitNode, new Map(inputDomain), dfg); + node = exitNode; } } if(cfinfo.exitPoints.includes(vertex.id)) { finalDomain = newDomain; } - visited.set(vertex.id, (visited.get(vertex.id) ?? 0) + 1); + const visitedCount = visited.get(vertex.id) ?? 
0; + visited.set(vertex.id, visitedCount + 1); - return getSuccessorVertices(cfg, vertex.id, dfg) - .filter(successor => !visited.has(successor.id) || !equalDataFrameState(newDomain, oldDomain)); + if(visitedCount >= WideningThreshold) { + newDomain = wideningDataFrameStates(oldDomain, newDomain); + } + if(node !== undefined) { + node.info.dataFrame ??= {}; + node.info.dataFrame.domain = new Map(newDomain); + } + if(!equalDataFrameState(oldDomain, newDomain)) { + return getSuccessorVertices(cfg, vertex.id, dfg); + } + return getSuccessorVertices(cfg, vertex.id, dfg).filter(successor => !visited.has(successor.id)); }; const cfg = flipCfg(cfinfo.graph); const entryPoints = cfinfo.entryPoints @@ -79,15 +93,15 @@ function foldCfg( } } -function isRConstant( - node: RNode -): node is RConstant { +function isRConstant( + node: RNode +): node is RConstant { return node.type === RType.String || node.type === RType.Number || node.type === RType.Logical; } -function isRSingleNode( - node: RNode -): node is RSingleNode { +function isRSingleNode( + node: RNode +): node is RSingleNode { return isRConstant(node) || node.type === RType.Symbol || node.type === RType.Break || node.type === RType.Next || node.type === RType.Comment || node.type === RType.LineDirective; } diff --git a/src/abstract-interpretation/data-frame/domain.ts b/src/abstract-interpretation/data-frame/domain.ts index eaf24f3ac7..55c8c4b6d8 100644 --- a/src/abstract-interpretation/data-frame/domain.ts +++ b/src/abstract-interpretation/data-frame/domain.ts @@ -1,6 +1,8 @@ import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; import { setEquals } from '../../util/set'; +const MaxColNames = 50; + type Interval = [number, number]; export const IntervalBottom = 'bottom'; @@ -42,15 +44,14 @@ export function leqColNames(X1: ColNamesDomain, X2: ColNamesDomain): boolean { export function joinColNames(X1: ColNamesDomain, X2: ColNamesDomain): ColNamesDomain { if(X1 === ColNamesTop || X2 === 
ColNamesTop) { return ColNamesTop; - } else { - return Array.from(new Set(X1).union(new Set(X2))); } + const join = Array.from(new Set(X1).union(new Set(X2))); + + return join.length > MaxColNames ? ColNamesTop : join; } export function meetColNames(X1: ColNamesDomain, X2: ColNamesDomain): ColNamesDomain { - if(X1 === ColNamesTop && X2 === ColNamesTop) { - return ColNamesTop; - } else if(X1 === ColNamesTop) { + if(X1 === ColNamesTop) { return X2; } else if(X2 === ColNamesTop) { return X1; @@ -69,6 +70,11 @@ export function subtractColNames(X1: ColNamesDomain, X2: ColNamesDomain): ColNam } } +/** We just join here since we have an upper limit {@link MaxColNames} for {@link joinColNames} */ +export function wideningColNames(X1: ColNamesDomain, X2: ColNamesDomain): ColNamesDomain { + return joinColNames(X1, X2); +} + export function equalInterval(X1: IntervalDomain, X2: IntervalDomain): boolean { return X1 === X2 || (X1 !== IntervalBottom && X1[0] === X2[0] && X1[1] === X2[1]); } @@ -78,9 +84,7 @@ export function leqInterval(X1: IntervalDomain, X2: IntervalDomain): boolean { } export function joinInterval(X1: IntervalDomain, X2: IntervalDomain): IntervalDomain { - if(X1 === IntervalBottom && X2 === IntervalBottom) { - return IntervalBottom; - } else if(X1 === IntervalBottom) { + if(X1 === IntervalBottom) { return X2; } else if(X2 === IntervalBottom) { return X1; @@ -111,7 +115,23 @@ export function subtractInterval(X1: IntervalDomain, X2: IntervalDomain): Interv if(X1 === IntervalBottom || X2 === IntervalBottom) { return IntervalBottom; } else { - return [X1[0] - X2[0], X1[1] - X2[1]]; + return [Math.max(X1[0] - X2[0], 0), Math.max(X1[1] - X2[1], 0)]; + } +} + +export function minInterval(X1: IntervalDomain, X2: IntervalDomain): IntervalDomain { + if(X1 === IntervalBottom || X2 === IntervalBottom) { + return IntervalBottom; + } else { + return [Math.min(X1[0], X2[0]), Math.min(X1[1], X2[1])]; + } +} + +export function maxInterval(X1: IntervalDomain, X2: 
IntervalDomain): IntervalDomain { + if(X1 === IntervalBottom || X2 === IntervalBottom) { + return IntervalBottom; + } else { + return [Math.max(X1[0], X2[0]), Math.max(X1[1], X2[1])]; } } @@ -123,6 +143,24 @@ export function includeZeroInterval(X: IntervalDomain): IntervalDomain { } } +export function includeInfinityInterval(X: IntervalDomain): IntervalDomain { + if(X === IntervalBottom) { + return IntervalBottom; + } else { + return [X[0], Infinity]; + } +} + +export function wideningInterval(X1: IntervalDomain, X2: IntervalDomain): IntervalDomain { + if(X1 === IntervalBottom) { + return X2; + } else if(X2 === IntervalBottom) { + return X1; + } else { + return [X1[0] <= X2[0] ? X1[0] : 0, X1[1] >= X2[1] ? X1[1] : Infinity]; + } +} + export function equalDataFrameDomain(X1: DataFrameDomain, X2: DataFrameDomain): boolean { return equalColNames(X1.colnames, X2.colnames) && equalInterval(X1.cols, X2.cols) && equalInterval(X1.rows, X2.rows); } @@ -143,6 +181,14 @@ export function meetDataFrames(...values: DataFrameDomain[]): DataFrameDomain { }), values[0] ?? DataFrameTop); } +export function wideningDataFrames(X1: DataFrameDomain, X2: DataFrameDomain): DataFrameDomain { + return { + colnames: wideningColNames(X1.colnames, X2.colnames), + cols: wideningInterval(X1.cols, X2.cols), + rows: wideningInterval(X1.rows, X2.rows) + }; +} + export function equalDataFrameState(R1: DataFrameStateDomain, R2: DataFrameStateDomain): boolean { if(R1.size !== R2.size) { return false; @@ -185,3 +231,16 @@ export function meetDataFrameStates(...values: DataFrameStateDomain[]): DataFram } return result; } + +export function wideningDataFrameStates(X1: DataFrameStateDomain, X2: DataFrameStateDomain): DataFrameStateDomain { + const result = new Map(X1); + + for(const [nodeId, value] of X2) { + if(result.has(nodeId)) { + result.set(nodeId, wideningDataFrames(result.get(nodeId) ?? 
DataFrameTop, value)); + } else { + result.set(nodeId, value); + } + } + return result; +} diff --git a/src/abstract-interpretation/data-frame/mappers/access-mapper.ts b/src/abstract-interpretation/data-frame/mappers/access-mapper.ts index 9a10a83365..7eee56afa6 100644 --- a/src/abstract-interpretation/data-frame/mappers/access-mapper.ts +++ b/src/abstract-interpretation/data-frame/mappers/access-mapper.ts @@ -6,114 +6,140 @@ import type { RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nod import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; -import type { DataFrameInfo } from '../absint-info'; -import { resolveIdToArgValue, resolveIdToArgValueSymbolName } from '../resolve-args'; +import type { AbstractInterpretationInfo, DataFrameInfo, DataFrameOperations } from '../absint-info'; +import { resolveIdToArgName, resolveIdToArgValue, resolveIdToArgValueSymbolName, unescapeArgument } from '../resolve-args'; import { isStringBasedAccess } from '../semantics-mapper'; -const SpecialAccessArgumentsMapper: Partial> = { +const SpecialAccessArgumentsMapper: Record = { '[': ['drop'], '[[': ['exact'] }; -export function mapDataFrameAccess( - node: RNode, +export function mapDataFrameAccess( + node: RNode, dfg: DataflowGraph ): DataFrameInfo | undefined { if(node.type === RType.Access) { + let operations: DataFrameOperations[] | undefined; + if(isStringBasedAccess(node)) { - return mapDataFrameNamedColumnAccess(node, { graph: dfg, idMap: dfg.idMap, full: true }); + operations = mapDataFrameNamedColumnAccess(node, { graph: dfg, idMap: dfg.idMap, full: true }); } else { - return mapDataFrameIndexColRowAccess(node, { graph: dfg, idMap: dfg.idMap, full: true }); + operations = mapDataFrameIndexColRowAccess(node, { graph: dfg, idMap: dfg.idMap, full: true }); + } + 
if(operations !== undefined) { + return { type: 'expression', operations: operations }; } } } -function mapDataFrameNamedColumnAccess( - access: RNamedAccess, +function mapDataFrameNamedColumnAccess( + access: RNamedAccess, info: ResolveInfo -): DataFrameInfo { +): DataFrameOperations[] | undefined { + const dataFrame = access.accessed; + + if(dataFrame.info.dataFrame?.domain?.get(dataFrame.info.id) === undefined) { + return; + } const argName = resolveIdToArgValueSymbolName(access.access[0], info); - return { - type: 'expression', - operations: [{ - operation: 'accessCol', - operand: access.accessed.info.id, - args: { columns: argName ? [argName] : undefined } - }] - }; + return [{ + operation: 'accessCols', + operand: dataFrame.info.id, + args: { columns: argName ? [argName] : undefined } + }]; } -function mapDataFrameIndexColRowAccess( - access: RIndexAccess, +function mapDataFrameIndexColRowAccess( + access: RIndexAccess, info: ResolveInfo -): DataFrameInfo { +): DataFrameOperations[] | undefined { + const dataFrame = access.accessed; + + if(dataFrame.info.dataFrame?.domain?.get(dataFrame.info.id) === undefined) { + return; + } const args = getEffectiveArgs(access.operator, access.access); if(args.every(arg => arg === EmptyArgument)) { - return { - type: 'expression', - operations: [{ - operation: 'identity', - operand: access.accessed.info.id, - args: {} - }] - }; - } else if(args.length > 0 && args.length <= 2) { - const rowArg = args.length < 2 ? undefined : args[0]; - const colArg = args.length < 2 ? args[0] : args[1]; - - const result: DataFrameInfo = { type: 'expression', operations: [] }; + return [{ + operation: 'identity', + operand: dataFrame.info.id, + args: {} + }]; + } + const result: DataFrameOperations[] = []; + const dropArg = access.access.find(arg => resolveIdToArgName(arg, info) === 'drop'); + const dropValue = dropArg !== undefined ? resolveIdToArgValue(dropArg, info) : undefined; + const rowArg = args.length < 2 ? 
undefined : args[0]; + const colArg = args.length < 2 ? args[0] : args[1]; + let rows: number[] | undefined = undefined; + let columns: string[] | number[] | undefined = undefined; + + if(rowArg !== undefined && rowArg !== EmptyArgument) { + const rowValue: unknown = resolveIdToArgValue(rowArg, info); + + if(typeof rowValue === 'number') { + rows = [rowValue]; + } else if(Array.isArray(rowValue) && rowValue.every(row => typeof row === 'number')) { + rows = rowValue; + } + result.push({ + operation: 'accessRows', + operand: dataFrame.info.id, + args: { rows: rows?.map(Math.abs) } + }); + } + if(colArg !== undefined && colArg !== EmptyArgument) { + const colValue: unknown = resolveIdToArgValue(colArg, info); + + if(typeof colValue === 'string') { + columns = [colValue]; + } else if(typeof colValue === 'number') { + columns = [colValue]; + } else if(Array.isArray(colValue) && (colValue.every(col => typeof col === 'string') || colValue.every(col => typeof col === 'number'))) { + columns = colValue; + } + result.push({ + operation: 'accessCols', + operand: dataFrame.info.id, + args: { columns: columns?.every(col => typeof col === 'number') ? columns.map(Math.abs) : columns } + }); + } + // The data frame extent is dropped if the operator `[[` is used, the argument `drop` is true, or only one column is accessed + const dropExtent = access.operator === '[[' ? true : + args.length === 2 && typeof dropValue === 'boolean' ? 
dropValue : + rowArg !== undefined && columns?.length === 1 && (typeof columns[0] === 'string' || columns[0] > 0); + + if(!dropExtent) { + let operand: RNode | undefined = dataFrame; if(rowArg !== undefined && rowArg !== EmptyArgument) { - const rowValue: unknown = resolveIdToArgValue(rowArg, info); - let rows: number[] | undefined = undefined; - - if(typeof rowValue === 'number') { - rows = [rowValue]; - } else if(Array.isArray(rowValue) && rowValue.every(row => typeof row === 'number')) { - rows = rowValue; - } - result.operations.push({ - operation: 'accessRow', - operand: access.accessed.info.id, - args: { rows } + result.push({ + operation: rows === undefined || rows?.every(row => row >= 0) ? 'subsetRows' : 'removeRows', + operand: operand?.info.id, + args: { rows: rows?.length } }); + operand = undefined; } if(colArg !== undefined && colArg !== EmptyArgument) { - const colValue: unknown = resolveIdToArgValue(colArg, info); - let columns: string[] | number[] | undefined = undefined; - - if(typeof colValue === 'string') { - columns = [colValue]; - } else if(typeof colValue === 'number') { - columns = [colValue]; - } else if(Array.isArray(colValue) && (colValue.every(col => typeof col === 'string') || colValue.every(col => typeof col === 'number'))) { - columns = colValue; - } - result.operations.push({ - operation: 'accessCol', - operand: access.accessed.info.id, - args: { columns } + result.push({ + operation: columns === undefined || columns?.every(col => typeof col === 'string' || col >= 0) ? 'subsetCols' : 'removeCols', + operand: operand?.info.id, + args: { colnames: columns?.map(col => typeof col === 'string' ? 
col : undefined) } }); + operand = undefined; } return result; } - return { - type: 'expression', - operations: [{ - operation: 'unknown', - operand: access.accessed.info.id, - args: { modifyInplace: true } - }] - }; } -function getEffectiveArgs( - funct: keyof typeof SpecialAccessArgumentsMapper, - args: readonly RFunctionArgument[] -): readonly RFunctionArgument[] { - const ignoredArgs = SpecialAccessArgumentsMapper[funct] ?? []; +function getEffectiveArgs( + operator: RIndexAccess['operator'], + args: readonly RFunctionArgument[] +): readonly RFunctionArgument[] { + const specialArgs = SpecialAccessArgumentsMapper[operator]; - return args.filter(arg => arg === EmptyArgument || arg.name === undefined || !ignoredArgs.includes(arg.name.content)); + return args.filter(arg => arg === EmptyArgument || arg.name === undefined || !specialArgs.includes(unescapeArgument(arg.name.content))); } diff --git a/src/abstract-interpretation/data-frame/mappers/assignment-mapper.ts b/src/abstract-interpretation/data-frame/mappers/assignment-mapper.ts index 95d5a6a337..144c8503b6 100644 --- a/src/abstract-interpretation/data-frame/mappers/assignment-mapper.ts +++ b/src/abstract-interpretation/data-frame/mappers/assignment-mapper.ts @@ -1,188 +1,30 @@ -import type { ResolveInfo } from '../../../dataflow/environments/resolve-by-name'; -import type { DataflowGraph } from '../../../dataflow/graph/graph'; -import { toUnnamedArgument } from '../../../dataflow/internal/process/functions/call/argument/make-argument'; import type { RNode } from '../../../r-bridge/lang-4.x/ast/model/model'; -import type { RIndexAccess, RNamedAccess } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-access'; -import type { RFunctionArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; -import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; import type { RString } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-string'; import type { RSymbol } 
from '../../../r-bridge/lang-4.x/ast/model/nodes/r-symbol'; import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; -import type { DataFrameInfo } from '../absint-info'; -import { resolveIdToArgStringVector, resolveIdToArgValue, resolveIdToArgValueSymbolName } from '../resolve-args'; -import { isStringBasedAccess } from '../semantics-mapper'; +import type { AbstractInterpretationInfo, DataFrameAssignmentInfo, DataFrameInfo } from '../absint-info'; -const DataFrameAssignmentFunctionMapper = { - 'colnames': mapDataFrameColNamesAssignment, - 'names': mapDataFrameColNamesAssignment, - 'rownames': mapDataFrameRowNamesAssignment, - 'dimnames': mapDataFrameDimNamesAssignment -} as const satisfies Record; - -type DataFrameAssignmentFunctionMapping = ( - operand: RFunctionArgument, - expression: RNode, - info: ResolveInfo -) => DataFrameInfo | undefined; - -type DataFrameAssignmentFunction = keyof typeof DataFrameAssignmentFunctionMapper; - -export function mapDataFrameAssignment( - node: RNode, - dfg: DataflowGraph +export function mapDataFrameAssignment( + node: RNode ): DataFrameInfo | undefined { if(node.type === RType.BinaryOp && node.lhs !== undefined && node.rhs !== undefined) { if(node.lhs.type === RType.Symbol || node.lhs.type === RType.String) { return mapDataFrameVariableAssignment(node.lhs, node.rhs); - } else if(node.lhs.type === RType.Access) { - if(isStringBasedAccess(node.lhs)) { - return mapDataFrameNamedColumnAssignment(node.lhs, node.rhs, { graph: dfg, idMap: dfg.idMap, full: true }); - } else { - return mapDataFrameIndexColRowAssignment(node.lhs, node.rhs, { graph: dfg, idMap: dfg.idMap, full: true }); - } - } else if(node.lhs.type === RType.FunctionCall && node.lhs.named) { - if(node.lhs.functionName.content in DataFrameAssignmentFunctionMapper && node.lhs.arguments.length > 0) { - const functionName = node.lhs.functionName.content as 
DataFrameAssignmentFunction; - const functionProcessor = DataFrameAssignmentFunctionMapper[functionName]; - - return functionProcessor(node.lhs.arguments[0], node.rhs, { graph: dfg, idMap: dfg.idMap, full: true }); - } } } } -function mapDataFrameVariableAssignment( - identifier: RSymbol | RString, - expression: RNode -): DataFrameInfo { +export function mapDataFrameVariableAssignment( + identifier: RSymbol | RString, + expression: RNode +): DataFrameAssignmentInfo | undefined { + if(expression.info.dataFrame?.domain?.get(expression.info.id) === undefined) { + return; + } return { type: 'assignment', identifier: identifier.info.id, expression: expression.info.id }; } - -function mapDataFrameNamedColumnAssignment( - access: RNamedAccess, - expression: RNode, - info: ResolveInfo -): DataFrameInfo { - const argName = resolveIdToArgValueSymbolName(access.access[0], info); - - return { - type: 'expression', - operations: [{ - operation: 'assignCol', - operand: access.accessed.info.id, - args: { columns: argName ? [argName] : undefined } - }] - }; -} - -function mapDataFrameIndexColRowAssignment( - access: RIndexAccess, - expression: RNode, - info: ResolveInfo -): DataFrameInfo { - const args = access.access; - - if(args.length === 0 || args.every(arg => arg === EmptyArgument)) { - return { - type: 'expression', - operations: [{ - operation: 'identity', - operand: access.accessed.info.id, - args: {} - }] - }; - } - const rowArg = args.length < 2 ? undefined : args[0]; - const colArg = args.length < 2 ? 
args[0] : args[1]; - - const result: DataFrameInfo = { type: 'expression', operations: [] }; - - if(rowArg !== undefined && rowArg !== EmptyArgument) { - const rowValue: unknown = resolveIdToArgValue(rowArg, info); - let rows: number[] | undefined = undefined; - - if(typeof rowValue === 'number') { - rows = [rowValue]; - } else if(Array.isArray(rowValue) && rowValue.every(row => typeof row === 'number')) { - rows = rowValue; - } - result.operations.push({ - operation: 'assignRow', - operand: access.accessed.info.id, - args: { rows } - }); - } - if(colArg !== undefined && colArg !== EmptyArgument) { - const colValue: unknown = resolveIdToArgValue(colArg, info); - let columns: string[] | number[] | undefined = undefined; - - if(typeof colValue === 'string') { - columns = [colValue]; - } else if(typeof colValue === 'number') { - columns = [colValue]; - } else if(Array.isArray(colValue) && (colValue.every(col => typeof col === 'string') || colValue.every(col => typeof col === 'number'))) { - columns = colValue; - } - result.operations.push({ - operation: 'assignCol', - operand: access.accessed.info.id, - args: { columns } - }); - } - return result; -} - -function mapDataFrameColNamesAssignment( - operand: RFunctionArgument, - expression: RNode, - info: ResolveInfo -): DataFrameInfo | undefined { - if(operand !== EmptyArgument && operand?.value !== undefined && info.idMap) { - const argument = toUnnamedArgument(expression, info.idMap); - const assignedNames = resolveIdToArgStringVector(argument, info); - - return { - type: 'expression', - operations: [{ - operation: 'setColNames', - operand: operand.value.info.id, - args: { colnames: assignedNames } - }] - }; - } -} - -function mapDataFrameRowNamesAssignment( - operand: RFunctionArgument -): DataFrameInfo | undefined { - if(operand !== EmptyArgument && operand?.value !== undefined) { - return { - type: 'expression', - operations: [{ - operand: operand.value.info.id, - operation: 'identity', - args: {} - }] - }; - } -} - 
-function mapDataFrameDimNamesAssignment( - operand: RFunctionArgument -): DataFrameInfo | undefined { - if(operand !== EmptyArgument && operand.value !== undefined) { - return { - type: 'expression', - operations: [{ - operand: operand.value.info.id, - operation: 'unknown', - args: { modifyInplace: true } - }] - }; - } -} diff --git a/src/abstract-interpretation/data-frame/mappers/function-mapper.ts b/src/abstract-interpretation/data-frame/mappers/function-mapper.ts index 3072d32e8c..19ee2371d7 100644 --- a/src/abstract-interpretation/data-frame/mappers/function-mapper.ts +++ b/src/abstract-interpretation/data-frame/mappers/function-mapper.ts @@ -1,120 +1,130 @@ import type { ResolveInfo } from '../../../dataflow/environments/resolve-by-name'; import type { DataflowGraph } from '../../../dataflow/graph/graph'; -import { VertexType } from '../../../dataflow/graph/vertex'; +import { isUseVertex, VertexType } from '../../../dataflow/graph/vertex'; import { toUnnamedArgument } from '../../../dataflow/internal/process/functions/call/argument/make-argument'; import type { RNode } from '../../../r-bridge/lang-4.x/ast/model/model'; +import type { RArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-argument'; import type { RFunctionArgument, RFunctionCall } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; -import { startAndEndsWith } from '../../../util/strings'; -import type { AbstractInterpretationInfo, DataFrameInfo } from '../absint-info'; +import type { AbstractInterpretationInfo, DataFrameInfo, DataFrameOperations } from '../absint-info'; import { DataFrameTop } from '../domain'; -import { resolveIdToArgName, resolveIdToArgValue, resolveIdToArgVectorLength } from '../resolve-args'; 
+import { resolveIdToArgName, resolveIdToArgValue, resolveIdToArgValueSymbolName, resolveIdToArgVectorLength, unescapeArgument } from '../resolve-args'; const ColNamesRegex = /^[A-Za-z.][A-Za-z0-9_.]*$/; const DataFrameFunctionMapper = { - 'data.frame': mapDataFrameCreate, - 'as.data.frame': mapDataFrameUnknownCreate, - 'read.csv': mapDataFrameUnknownCreate, - 'read.table': mapDataFrameUnknownCreate, - 'cbind': mapDataFrameColBind, - 'rbind': mapDataFrameRowBind -} as const satisfies Record; - -const SpecialFunctionArgumentsMapper: Partial> = { - 'data.frame': ['row.names', 'check.rows', 'check.names', 'fix.empty.names', 'stringsAsFactors'] -}; - -type DataFrameFunctionMapping = ( - args: readonly RFunctionArgument[], + 'data.frame': { mapper: mapDataFrameCreate, specialArgs: ['row.names', 'check.rows', 'check.names', 'fix.empty.names', 'stringsAsFactors'] }, + 'as.data.frame': { mapper: mapDataFrameIdentity, specialArgs: ['row.names', 'check.rows', 'check.names', 'fix.empty.names', 'stringsAsFactors'] }, + 'read.csv': { mapper: mapDataFrameUnknownCreate, specialArgs: [] }, + 'read.table': { mapper: mapDataFrameUnknownCreate, specialArgs: [] }, + 'cbind': { mapper: mapDataFrameColBind, specialArgs: ['deparse.level', 'make.row.names', 'stringsAsFactors', 'factor.exclude'] }, + 'rbind': { mapper: mapDataFrameRowBind, specialArgs: ['deparse.level', 'make.row.names', 'stringsAsFactors', 'factor.exclude'] }, + 'head': { mapper: mapDataFrameHeadTail, specialArgs: ['addrownums'] }, + 'tail': { mapper: mapDataFrameHeadTail, specialArgs: ['addrownums'] }, + 'subset': { mapper: mapDataFrameSubset, specialArgs: ['drop'] }, + 'filter': { mapper: mapDataFrameFilter, specialArgs: ['.by', '.preserve'] }, + 'select': { mapper: mapDataFrameSelect, specialArgs: [] }, + 'transform': { mapper: mapDataFrameMutate, specialArgs: [] }, + 'mutate': { mapper: mapDataFrameMutate, specialArgs: ['.by', '.keep', '.before', '.after'] }, + 'group_by': { mapper: mapDataFrameGroupBy, specialArgs: 
['.add', '.drop'] }, + 'summarise': { mapper: mapDataFrameSummarize, specialArgs: ['.by', '.groups'] }, + 'summarize': { mapper: mapDataFrameSummarize, specialArgs: ['.by', '.groups'] }, + 'left_join': { mapper: mapDataFrameLeftJoin, specialArgs: ['copy', 'suffix', 'keep'] }, + 'merge': { mapper: (...args) => mapDataFrameLeftJoin(...args, true), specialArgs: ['by.x', 'by.y', 'all', 'all.x', 'all.y', 'sort', 'suffixes', 'no.dups', 'incomparables'] }, + 'relocate': { mapper: mapDataFrameIdentity, specialArgs: ['.before', '.after'] }, + 'arrange': { mapper: mapDataFrameIdentity, specialArgs: ['.by_group', '.locale'] } +} as const satisfies Record; + +type DataFrameFunctionMapperInfo = { + readonly mapper: DataFrameFunctionMapping, + readonly specialArgs: string[] +} + +type DataFrameFunctionMapping = ( + args: readonly RFunctionArgument[], info: ResolveInfo -) => DataFrameInfo | undefined; +) => DataFrameOperations[] | undefined; type DataFrameFunction = keyof typeof DataFrameFunctionMapper; -export function mapDataFrameFunctionCall( - node: RNode, +export function mapDataFrameFunctionCall( + node: RNode, dfg: DataflowGraph ): DataFrameInfo | undefined { if(node.type === RType.FunctionCall && node.named && node.functionName.content in DataFrameFunctionMapper) { - const args = getFunctionArguments(node, dfg); const functionName = node.functionName.content as DataFrameFunction; - const functionProcessor = DataFrameFunctionMapper[functionName]; + const args = getFunctionArguments(node, dfg); + const effectiveArgs = getEffectiveArgs(functionName, args); + const resolveInfo = { graph: dfg, idMap: dfg.idMap, full: true }; + const functionMapping = DataFrameFunctionMapper[functionName]; - return functionProcessor(args, { graph: dfg, idMap: dfg.idMap, full: true }); + const operations = functionMapping.mapper(effectiveArgs, resolveInfo); + + if(operations !== undefined) { + return { type: 'expression', operations: operations }; + } } } -function mapDataFrameCreate( - args: 
readonly RFunctionArgument[], +function mapDataFrameCreate( + args: readonly RFunctionArgument[], info: ResolveInfo -): DataFrameInfo { - const columnArgs = getEffectiveArgs('data.frame', args); - - const argNames = columnArgs.map(arg => arg ? resolveIdToArgName(arg, info) : undefined).map(unescapeArgument); - const argLengths = columnArgs.map(arg => arg ? resolveIdToArgVectorLength(arg, info) : undefined); +): DataFrameOperations[] { + const argNames = args.map(arg => arg ? resolveIdToArgName(arg, info) : undefined); + const argLengths = args.map(arg => arg ? resolveIdToArgVectorLength(arg, info) : undefined); const colnames = argNames.map(arg => isValidColName(arg) ? arg : undefined); const rows = argLengths.every(arg => arg !== undefined) ? Math.max(...argLengths, 0) : undefined; - return { - type: 'expression', - operations: [{ - operation: 'create', - operand: undefined, - args: { colnames, rows } - }] - }; + return [{ + operation: 'create', + operand: undefined, + args: { colnames: colnames, rows: rows } + }]; } -function mapDataFrameUnknownCreate(): DataFrameInfo { - return { - type: 'expression', - operations: [{ - operation: 'unknown', - operand: undefined, - args: { creation: true } - }] - }; +function mapDataFrameUnknownCreate(): DataFrameOperations[] { + return [{ + operation: 'unknownCreate', + operand: undefined, + args: {} + }]; } -function mapDataFrameColBind( - args: readonly RFunctionArgument[], +function mapDataFrameColBind( + args: readonly RFunctionArgument[], info: ResolveInfo -): DataFrameInfo | undefined { +): DataFrameOperations[] | undefined { const dataFrame = args.find(isDataFrameArgument); - if(dataFrame === undefined || dataFrame === EmptyArgument || dataFrame.value === undefined) { + if(dataFrame === undefined) { return; } else if(args.length === 1) { - return { - type: 'expression', - operations: [{ - operation: 'identity', - operand: dataFrame.value.info.id, - args: {} - }] - }; - } - const result: DataFrameInfo = { type: 
'expression', operations: [] }; - let operand: RNode | undefined = dataFrame.value; + return [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }]; + } + const result: DataFrameOperations[] = []; + let operand: RNode | undefined = dataFrame.value; let colnames: (string | undefined)[] | undefined = []; for(const arg of args) { if(arg !== dataFrame && arg !== EmptyArgument) { - if(arg.value !== undefined && isDataFrameArgument(arg)) { - const other = arg.value.info.dataFrame?.domain?.get(arg.value.info.id) ?? DataFrameTop; + if(isDataFrameArgument(arg)) { + const otherDataFrame = arg.value.info.dataFrame.domain?.get(arg.value.info.id) ?? DataFrameTop; - result.operations.push({ + result.push({ operation: 'concatCols', operand: operand?.info.id, - args: { other: other } + args: { other: otherDataFrame } }); operand = undefined; - // Added columns are unknown if argument cannot be resolved to constant (vector-like) value + // Added columns are undefined if argument cannot be resolved to constant (vector-like) value } else if(resolveIdToArgValue(arg, info) !== undefined) { - const colname = unescapeArgument(resolveIdToArgName(arg, info)); + const colname = resolveIdToArgName(arg, info); colnames?.push(colname); } else { colnames = undefined; @@ -122,7 +132,7 @@ function mapDataFrameColBind( } } if(colnames === undefined || colnames.length > 0) { - result.operations.push({ + result.push({ operation: 'addCols', operand: operand?.info.id, args: { colnames: colnames } @@ -131,40 +141,37 @@ function mapDataFrameColBind( return result; } -function mapDataFrameRowBind( - args: readonly RFunctionArgument[], +function mapDataFrameRowBind( + args: readonly RFunctionArgument[], info: ResolveInfo -): DataFrameInfo | undefined { +): DataFrameOperations[] | undefined { const dataFrame = args.find(isDataFrameArgument); - if(dataFrame === undefined || dataFrame === EmptyArgument || dataFrame.value === undefined) { + if(dataFrame === undefined) { return; } else 
if(args.length === 1) { - return { - type: 'expression', - operations: [{ - operation: 'identity', - operand: dataFrame.value.info.id, - args: {} - }] - }; - } - const result: DataFrameInfo = { type: 'expression', operations: [] }; - let operand: RNode | undefined = dataFrame.value; + return [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }]; + } + const result: DataFrameOperations[] = []; + let operand: RNode | undefined = dataFrame.value; let rows: number | undefined = 0; for(const arg of args) { if(arg !== dataFrame && arg !== EmptyArgument) { - if(arg.value !== undefined && isDataFrameArgument(arg)) { - const other = arg.value.info.dataFrame?.domain?.get(arg.value.info.id) ?? DataFrameTop; + if(isDataFrameArgument(arg)) { + const otherDataFrame = arg.value.info.dataFrame.domain?.get(arg.value.info.id) ?? DataFrameTop; - result.operations.push({ + result.push({ operation: 'concatRows', operand: operand?.info.id, - args: { other: other } + args: { other: otherDataFrame } }); operand = undefined; - // Number of added rows is unknown if arguments cannot be resolved to constant (vector-like) value + // Number of added rows is undefined if arguments cannot be resolved to constant (vector-like) value } else if(resolveIdToArgValue(arg, info) !== undefined) { rows = rows !== undefined ? 
rows + 1 : undefined; } else { @@ -173,7 +180,7 @@ function mapDataFrameRowBind( } } if(rows === undefined || rows > 0) { - result.operations.push({ + result.push({ operation: 'addRows', operand: operand?.info.id, args: { rows: rows } @@ -182,6 +189,396 @@ function mapDataFrameRowBind( return result; } +function mapDataFrameHeadTail( + args: readonly RFunctionArgument[], + info: ResolveInfo +): DataFrameOperations[] | undefined { + const dataFrame = args[0]; + + if(!isDataFrameArgument(dataFrame)) { + return; + } else if(args.length === 1) { + return [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }]; + } + const result: DataFrameOperations[] = []; + const amountArg = args.find(arg => resolveIdToArgName(arg, info) === 'n') ?? args[1]; + const amountValue: unknown = resolveIdToArgValue(amountArg, info); + let rows: number | undefined = undefined; + let cols: number | undefined = undefined; + + if(typeof amountValue === 'number') { + rows = amountValue; + } else if(Array.isArray(amountValue) && amountValue.length <= 2 && amountValue.every(value => typeof value === 'number')) { + rows = amountValue[0]; + cols = amountValue[1]; + } + result.push({ + operation: rows === undefined || rows >= 0 ? 'subsetRows' : 'removeRows', + operand: dataFrame.value.info.id, + args: { rows: rows !== undefined ? Math.abs(rows) : undefined } + }); + + if(cols !== undefined) { + result.push({ + operation: cols >= 0 ? 
'subsetCols' : 'removeCols', + operand: undefined, + args: { colnames: Array(Math.abs(cols)).fill(undefined) } + }); + } + return result; +} + +function mapDataFrameSubset( + args: readonly RFunctionArgument[], + info: ResolveInfo +): DataFrameOperations[] | undefined { + const dataFrame = args[0]; + + if(!isDataFrameArgument(dataFrame)) { + return; + } else if(args.length === 1) { + return [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }]; + } + const result: DataFrameOperations[] = []; + let operand: RNode | undefined = dataFrame.value; + + const filterArg = args.find(arg => resolveIdToArgName(arg, info) === 'subset') + ?? args.find(arg => arg !== dataFrame && resolveIdToArgName(arg, info) === undefined) + ?? EmptyArgument; + const filterValue = resolveIdToArgValue(filterArg, info); + + const selectArg = args.find(arg => resolveIdToArgName(arg, info) === 'select') + ?? args.find(arg => arg !== dataFrame && arg !== filterArg && resolveIdToArgName(arg, info) === undefined) + ?? EmptyArgument; + + const accessedNames = [...getUnresolvedSymbolsInExpression(filterArg, info), ...getUnresolvedSymbolsInExpression(selectArg, info)]; + const condition = typeof filterValue === 'boolean' ? 
filterValue : undefined; + let selectedCols: (string | undefined)[] | undefined = []; + let unselectedCols: (string | undefined)[] = []; + + if(selectArg !== EmptyArgument) { + if(selectArg.value?.type === RType.FunctionCall && selectArg.value.named && selectArg.value.functionName.content === 'c') { + selectArg.value.arguments.forEach(arg => { + if(arg !== EmptyArgument && arg.value?.type === RType.UnaryOp && arg.value.operator === '-' && info.idMap !== undefined) { + const operandArg = toUnnamedArgument(arg.value.operand, info.idMap); + unselectedCols?.push(resolveIdToArgValueSymbolName(operandArg, info)); + } else if(arg !== EmptyArgument && (arg.value?.type === RType.Symbol || arg.value?.type === RType.String)) { + selectedCols?.push(resolveIdToArgValueSymbolName(arg, info)); + } else { + selectedCols?.push(undefined); + } + }); + } else if(selectArg.value?.type === RType.UnaryOp && selectArg.value.operator === '-' && info.idMap !== undefined) { + const operandArg = toUnnamedArgument(selectArg.value.operand, info.idMap); + unselectedCols = [resolveIdToArgValueSymbolName(operandArg, info)]; + } else if(selectArg.value?.type === RType.Symbol || selectArg.value?.type === RType.String) { + selectedCols = [resolveIdToArgValueSymbolName(selectArg, info)]; + } else { + selectedCols = undefined; + } + } + + if(accessedNames.length > 0) { + result.push({ + operation: 'accessCols', + operand: operand?.info.id, + args: { columns: accessedNames } + }); + } + + if(filterArg !== EmptyArgument) { + result.push({ + operation: 'filterRows', + operand: operand?.info.id, + args: { condition: condition } + }); + operand = undefined; + } + + if(unselectedCols.length > 0) { + result.push({ + operation: 'removeCols', + operand: operand?.info.id, + args: { colnames: unselectedCols } + }); + operand = undefined; + } + if(selectedCols == undefined || selectedCols.length > 0) { + result.push({ + operation: 'subsetCols', + operand: operand?.info.id, + args: { colnames: selectedCols } + }); 
+ operand = undefined; + } + return result; +} + +function mapDataFrameFilter( + args: readonly RFunctionArgument[], + info: ResolveInfo +): DataFrameOperations[] | undefined { + const dataFrame = args[0]; + + if(!isDataFrameArgument(dataFrame)) { + return; + } else if(args.length === 1) { + return [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }]; + } + const result: DataFrameOperations[] = []; + const filterArg = args[1]; + const filterValue = resolveIdToArgValue(filterArg, info); + const accessedNames = args.slice(1).flatMap(arg => getUnresolvedSymbolsInExpression(arg, info)); + const condition = typeof filterValue === 'boolean' && args.length === 2 ? filterValue : undefined; + + if(accessedNames.length > 0) { + result.push({ + operation: 'accessCols', + operand: dataFrame.value.info.id, + args: { columns: accessedNames } + }); + } + + result.push({ + operation: 'filterRows', + operand: dataFrame.value.info.id, + args: { condition: condition } + }); + return result; +} + +function mapDataFrameSelect( + args: readonly RFunctionArgument[], + info: ResolveInfo +): DataFrameOperations[] | undefined { + const dataFrame = args[0]; + + if(!isDataFrameArgument(dataFrame)) { + return; + } else if(args.length === 1) { + return [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }]; + } + const result: DataFrameOperations[] = []; + let operand: RNode | undefined = dataFrame.value; + const selectedCols: (string | undefined)[] = []; + const unselectedCols: (string | undefined)[] = []; + + for(const arg of args) { + if(arg !== dataFrame && arg !== EmptyArgument) { + if(arg.value?.type === RType.UnaryOp && arg.value.operator === '-' && info.idMap !== undefined) { + const operandArg = toUnnamedArgument(arg.value.operand, info.idMap); + unselectedCols.push(resolveIdToArgValueSymbolName(operandArg, info)); + } else { + selectedCols.push(resolveIdToArgValueSymbolName(arg, info)); + } + } + } + + if([...selectedCols, 
...unselectedCols].some(col => col !== undefined)) { + result.push({ + operation: 'accessCols', + operand: operand?.info.id, + args: { columns: [...selectedCols, ...unselectedCols].filter(col => col !== undefined) } + }); + } + + if(unselectedCols.length > 0) { + result.push({ + operation: 'removeCols', + operand: operand?.info.id, + args: { colnames: unselectedCols } + }); + operand = undefined; + } + if(selectedCols.length > 0) { + result.push({ + operation: 'subsetCols', + operand: operand?.info.id, + args: { colnames: selectedCols } + }); + operand = undefined; + } + return result; +} + +function mapDataFrameMutate( + args: readonly RFunctionArgument[], + info: ResolveInfo +): DataFrameOperations[] | undefined { + const dataFrame = args[0]; + + if(!isDataFrameArgument(dataFrame)) { + return; + } else if(args.length === 1) { + return [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }]; + } + const result: DataFrameOperations[] = []; + const accessedNames = args.slice(1).flatMap(arg => getUnresolvedSymbolsInExpression(arg, info)); + const mutatedCols = args.slice(1).map(arg => resolveIdToArgName(arg, info)); + + if(accessedNames.length > 0) { + result.push({ + operation: 'accessCols', + operand: dataFrame.value.info.id, + args: { columns: accessedNames } + }); + } + + result.push({ + operation: 'mutateCols', + operand: dataFrame.value.info.id, + args: { colnames: mutatedCols } + }); + return result; +} + +function mapDataFrameGroupBy( + args: readonly RFunctionArgument[], + info: ResolveInfo +): DataFrameOperations[] | undefined { + const dataFrame = args[0]; + + if(!isDataFrameArgument(dataFrame)) { + return; + } else if(args.length === 1) { + return [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }]; + } + const result: DataFrameOperations[] = []; + const byArg = args[1]; + const byName = resolveIdToArgValueSymbolName(byArg, info); + + if(byName !== undefined) { + result.push({ + operation: 'accessCols', 
+ operand: dataFrame.value.info.id, + args: { columns: [byName] } + }); + } + + result.push({ + operation: 'groupBy', + operand: dataFrame.value.info.id, + args: { by: typeof byName === 'string' ? byName : undefined } + }); + return result; +} + +function mapDataFrameSummarize( + args: readonly RFunctionArgument[], + info: ResolveInfo +): DataFrameOperations[] | undefined { + const dataFrame = args[0]; + + if(!isDataFrameArgument(dataFrame)) { + return; + } else if(args.length === 1) { + return [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }]; + } + const result: DataFrameOperations[] = []; + const accessedNames = args.slice(1).flatMap(arg => getUnresolvedSymbolsInExpression(arg, info)); + const summarizedCols = args.slice(1).map(arg => resolveIdToArgName(arg, info)); + + if(accessedNames.length > 0) { + result.push({ + operation: 'accessCols', + operand: dataFrame.value.info.id, + args: { columns: accessedNames } + }); + } + + result.push({ + operation: 'summarize', + operand: dataFrame.value.info.id, + args: { colnames: summarizedCols } + }); + return result; +} + +function mapDataFrameLeftJoin( + args: readonly RFunctionArgument[], + info: ResolveInfo, + minRows?: boolean +): DataFrameOperations[] | undefined { + const dataFrame = args[0]; + + if(!isDataFrameArgument(dataFrame)) { + return; + } else if(args.length === 1) { + return [{ + operation: 'identity', + operand: dataFrame.value.info.id, + args: {} + }]; + } + const result: DataFrameOperations[] = []; + const otherArg = args[1]; + const otherDataFrame = isDataFrameArgument(otherArg) ? otherArg.value.info.dataFrame.domain?.get(otherArg.value.info.id) : undefined; + + const byArg = args.find(arg => resolveIdToArgName(arg, info) === 'by') + ?? args.find(arg => arg !== dataFrame && resolveIdToArgName(arg, info) === undefined) + ?? 
EmptyArgument; + const byName = resolveIdToArgValueSymbolName(byArg, info); + + if(byName !== undefined) { + result.push({ + operation: 'accessCols', + operand: dataFrame.value.info.id, + args: { columns: [byName] } + }); + } + + result.push({ + operation: 'leftJoin', + operand: dataFrame.value.info.id, + args: { + other: otherDataFrame ?? DataFrameTop, + by: typeof byName === 'string' ? byName : undefined, + minRows: minRows + } + }); + return result; +} + +function mapDataFrameIdentity( + args: readonly RFunctionArgument[] +): DataFrameOperations[] | undefined { + const dataFrame = args.find(isDataFrameArgument); + + return [{ + operation: 'identity', + operand: dataFrame?.value.info.id, + args: {} + }]; +} + function getFunctionArguments( node: RFunctionCall, dfg: DataflowGraph @@ -198,19 +595,52 @@ function getFunctionArguments( return node.arguments; } -function getEffectiveArgs( - funct: keyof typeof SpecialFunctionArgumentsMapper, - args: readonly RFunctionArgument[] -): readonly RFunctionArgument[] { - const ignoredArgs = SpecialFunctionArgumentsMapper[funct] ?? 
[]; +function getEffectiveArgs( + funct: DataFrameFunction, + args: readonly RFunctionArgument[] +): readonly RFunctionArgument[] { + const specialArgs: string[] = DataFrameFunctionMapper[funct].specialArgs; - return args.filter(arg => arg === EmptyArgument || arg.name === undefined || !ignoredArgs.includes(arg.name.content)); + return args.filter(arg => arg === EmptyArgument || arg.name === undefined || !specialArgs.includes(unescapeArgument(arg.name.content))); } -function isDataFrameArgument( - arg: RFunctionArgument -): boolean { - if(arg === EmptyArgument || arg.value === undefined) { +function getUnresolvedSymbolsInExpression( + expression: RNode | typeof EmptyArgument | undefined, + info: ResolveInfo +): string[] { + if(expression === undefined || expression === EmptyArgument || info.graph === undefined) { + return []; + } + switch(expression.type) { + case RType.ExpressionList: + return [...expression.children.flatMap(child => getUnresolvedSymbolsInExpression(child, info))]; + case RType.FunctionCall: + return [...expression.arguments.flatMap(arg => getUnresolvedSymbolsInExpression(arg, info))]; + case RType.UnaryOp: + return [...getUnresolvedSymbolsInExpression(expression.operand, info)]; + case RType.BinaryOp: + return [...getUnresolvedSymbolsInExpression(expression.lhs, info), ...getUnresolvedSymbolsInExpression(expression.rhs, info)]; + case RType.Access: + return [...getUnresolvedSymbolsInExpression(expression.accessed, info), ...expression.access.flatMap(arg => getUnresolvedSymbolsInExpression(arg, info))]; + case RType.Pipe: + return [...getUnresolvedSymbolsInExpression(expression.lhs, info), ...getUnresolvedSymbolsInExpression(expression.rhs, info)]; + case RType.Argument: + return [...getUnresolvedSymbolsInExpression(expression.value, info)]; + case RType.Symbol: + if(isUseVertex(info.graph.getVertex(expression.info.id)) && (info.graph.outgoingEdges(expression.info.id)?.size ?? 
0) === 0) { + return [unescapeArgument(expression.content)]; + } else { + return []; + } + default: + return []; + } +} + +function isDataFrameArgument( + arg: RFunctionArgument | undefined +): arg is RArgument> & {value: RNode>} { + if(arg === EmptyArgument || arg?.value === undefined) { return false; } return arg.value.info.dataFrame?.domain?.get(arg.value.info.id) !== undefined; @@ -219,15 +649,3 @@ function isDataFrameArgument( function isValidColName(colname: string | undefined): boolean { return colname !== undefined && ColNamesRegex.test(colname); } - -function unescapeArgument(argument: undefined): undefined; -function unescapeArgument(argument: string): string; -function unescapeArgument(argument: string | undefined): string | undefined; -function unescapeArgument(argument: string | undefined): string | undefined { - if(argument === undefined) { - return undefined; - } else if(startAndEndsWith(argument, '`') || startAndEndsWith(argument, '"') || startAndEndsWith(argument, '\'')) { - return argument.slice(1, -1); - } - return argument; -} diff --git a/src/abstract-interpretation/data-frame/mappers/replacement-mapper.ts b/src/abstract-interpretation/data-frame/mappers/replacement-mapper.ts new file mode 100644 index 0000000000..becebb3078 --- /dev/null +++ b/src/abstract-interpretation/data-frame/mappers/replacement-mapper.ts @@ -0,0 +1,167 @@ +import type { ResolveInfo } from '../../../dataflow/environments/resolve-by-name'; +import type { DataflowGraph } from '../../../dataflow/graph/graph'; +import { toUnnamedArgument } from '../../../dataflow/internal/process/functions/call/argument/make-argument'; +import type { RNode } from '../../../r-bridge/lang-4.x/ast/model/model'; +import type { RIndexAccess, RNamedAccess } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-access'; +import type { RArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-argument'; +import { EmptyArgument } from '../../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; 
+import type { ParentInformation } from '../../../r-bridge/lang-4.x/ast/model/processing/decorate'; +import { RType } from '../../../r-bridge/lang-4.x/ast/model/type'; +import type { AbstractInterpretationInfo, DataFrameInfo, DataFrameOperations } from '../absint-info'; +import { resolveIdToArgStringVector, resolveIdToArgValue, resolveIdToArgValueSymbolName } from '../resolve-args'; +import { isStringBasedAccess } from '../semantics-mapper'; +import { mapDataFrameVariableAssignment } from './assignment-mapper'; + +const DataFrameAssignmentFunctionMapper = { + 'colnames': mapDataFrameColNamesAssignment, + 'names': mapDataFrameColNamesAssignment, + 'rownames': mapDataFrameRowNamesAssignment, + 'dimnames': mapDataFrameDimNamesAssignment +} as const satisfies Record; + +type DataFrameAssignmentFunctionMapping = ( + operand: RArgument, + expression: RNode, + info: ResolveInfo +) => DataFrameOperations[] | undefined; + +type DataFrameAssignmentFunction = keyof typeof DataFrameAssignmentFunctionMapper; + +export function mapDataFrameReplacement( + node: RNode, + dfg: DataflowGraph +): DataFrameInfo | undefined { + if(node.type === RType.BinaryOp && node.lhs !== undefined && node.rhs !== undefined) { + let operations: DataFrameOperations[] | undefined; + + if(node.lhs.type === RType.Access) { + if(node.lhs.accessed.type === RType.Symbol && node.lhs.access.every(access => access === EmptyArgument)) { + return mapDataFrameVariableAssignment(node.lhs.accessed, node.rhs); + } else if(isStringBasedAccess(node.lhs)) { + operations = mapDataFrameNamedColumnAssignment(node.lhs, node.rhs, { graph: dfg, idMap: dfg.idMap, full: true }); + } else { + operations = mapDataFrameIndexColRowAssignment(node.lhs, node.rhs, { graph: dfg, idMap: dfg.idMap, full: true }); + } + } else if(node.lhs.type === RType.FunctionCall && node.lhs.named && node.lhs.arguments.length === 1 && node.lhs.arguments[0] !== EmptyArgument) { + if(node.lhs.functionName.content in DataFrameAssignmentFunctionMapper && 
node.lhs.arguments.length > 0) { + const functionName = node.lhs.functionName.content as DataFrameAssignmentFunction; + const functionMapping = DataFrameAssignmentFunctionMapper[functionName]; + + operations = functionMapping(node.lhs.arguments[0], node.rhs, { graph: dfg, idMap: dfg.idMap, full: true }); + } else { + operations = mapDataFrameUnknownAssignment(node.lhs.arguments[0]); + } + } + if(operations !== undefined) { + return { type: 'expression', operations: operations }; + } + } +} + +function mapDataFrameNamedColumnAssignment( + access: RNamedAccess, + expression: RNode, + info: ResolveInfo +): DataFrameOperations[] | undefined { + const dataFrame = access.accessed; + + if(dataFrame.info.dataFrame?.domain?.get(dataFrame.info.id) === undefined) { + return; + } + const argName = resolveIdToArgValueSymbolName(access.access[0], info); + + return [{ + operation: 'assignCols', + operand: dataFrame.info.id, + args: { columns: argName ? [argName] : undefined } + }]; +} + +function mapDataFrameIndexColRowAssignment( + access: RIndexAccess, + expression: RNode, + info: ResolveInfo +): DataFrameOperations[] | undefined { + const dataFrame = access.accessed; + const args = access.access; + + if(dataFrame.info.dataFrame?.domain?.get(dataFrame.info.id) === undefined || args.every(arg => arg === EmptyArgument)) { + return; + } + const result: DataFrameOperations[] = []; + const rowArg = args.length < 2 ? undefined : args[0]; + const colArg = args.length < 2 ? 
args[0] : args[1]; + + if(rowArg !== undefined && rowArg !== EmptyArgument) { + const rowValue: unknown = resolveIdToArgValue(rowArg, info); + let rows: number[] | undefined = undefined; + + if(typeof rowValue === 'number') { + rows = [rowValue]; + } else if(Array.isArray(rowValue) && rowValue.every(row => typeof row === 'number')) { + rows = rowValue; + } + result.push({ + operation: 'assignRows', + operand: dataFrame.info.id, + args: { rows: rows } + }); + } + if(colArg !== undefined && colArg !== EmptyArgument) { + const colValue: unknown = resolveIdToArgValue(colArg, info); + let columns: string[] | number[] | undefined = undefined; + + if(typeof colValue === 'string') { + columns = [colValue]; + } else if(typeof colValue === 'number') { + columns = [colValue]; + } else if(Array.isArray(colValue) && (colValue.every(col => typeof col === 'string') || colValue.every(col => typeof col === 'number'))) { + columns = colValue; + } + result.push({ + operation: 'assignCols', + operand: dataFrame.info.id, + args: { columns: columns } + }); + } + return result; +} + +function mapDataFrameColNamesAssignment( + operand: RArgument, + expression: RNode, + info: ResolveInfo +): DataFrameOperations[] | undefined { + const argument = info.idMap !== undefined ? 
toUnnamedArgument(expression, info.idMap) : EmptyArgument; + const assignedNames = resolveIdToArgStringVector(argument, info); + + return [{ + operation: 'setColNames', + operand: operand.value?.info.id, + args: { colnames: assignedNames } + }]; +} + +function mapDataFrameRowNamesAssignment(): DataFrameOperations[] | undefined { + return undefined; +} + +function mapDataFrameDimNamesAssignment( + operand: RArgument +): DataFrameOperations[] { + return [{ + operation: 'setColNames', + operand: operand.value?.info.id, + args: { colnames: undefined } + }]; +} + +function mapDataFrameUnknownAssignment( + operand: RArgument +): DataFrameOperations[] { + return [{ + operation: 'unknownModify', + operand: operand.value?.info.id, + args: {} + }]; +} diff --git a/src/abstract-interpretation/data-frame/processor.ts b/src/abstract-interpretation/data-frame/processor.ts index 602f62c674..a2f49d2d65 100644 --- a/src/abstract-interpretation/data-frame/processor.ts +++ b/src/abstract-interpretation/data-frame/processor.ts @@ -1,40 +1,24 @@ -import type { BuiltInMappingName } from '../../dataflow/environments/built-in'; -import { DefaultBuiltinConfig } from '../../dataflow/environments/default-builtin-config'; -import { EdgeType } from '../../dataflow/graph/edge'; +import { BuiltInProcessorMapper, type BuiltInMappingName } from '../../dataflow/environments/built-in'; +import { edgeDoesNotIncludeType, edgeIncludesType, EdgeType } from '../../dataflow/graph/edge'; import { type DataflowGraph } from '../../dataflow/graph/graph'; +import { VertexType } from '../../dataflow/graph/vertex'; import type { NoInfo, RNode, RSingleNode } from '../../r-bridge/lang-4.x/ast/model/model'; import type { RAccess } from '../../r-bridge/lang-4.x/ast/model/nodes/r-access'; import type { RArgument } from '../../r-bridge/lang-4.x/ast/model/nodes/r-argument'; import type { RBinaryOp } from '../../r-bridge/lang-4.x/ast/model/nodes/r-binary-op'; -import type { RFunctionCall } from 
'../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; +import { EmptyArgument, type RFunctionCall } from '../../r-bridge/lang-4.x/ast/model/nodes/r-function-call'; import type { RIfThenElse } from '../../r-bridge/lang-4.x/ast/model/nodes/r-if-then-else'; import type { RPipe } from '../../r-bridge/lang-4.x/ast/model/nodes/r-pipe'; import type { RUnaryOp } from '../../r-bridge/lang-4.x/ast/model/nodes/r-unary-op'; import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/processing/decorate'; import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; -import type { RFalse, RTrue } from '../../r-bridge/lang-4.x/convert-values'; import type { AbstractInterpretationInfo } from './absint-info'; import type { DataFrameDomain, DataFrameStateDomain } from './domain'; import { DataFrameTop, joinDataFrames } from './domain'; -import { applySemantics, ConstraintType, getConstraintTypes } from './semantics'; +import { applySemantics, ConstraintType, getConstraintType } from './semantics'; import { mapDataFrameSemantics } from './semantics-mapper'; -export type ConditionalDataFrameState = Record<'FD' | typeof RTrue | typeof RFalse, DataFrameStateDomain>; - -type ROperation = RFunctionCall | RUnaryOp | RBinaryOp | RAccess; -type RComplexNode = Exclude, RSingleNode>; - -type DataFrameProcessor> = ( - node: Node, - domain: DataFrameStateDomain, - dfg: DataflowGraph -) => DataFrameStateDomain; - -type DataFrameProcessorMapping = { - [Node in RComplexNode as Node['type']]: DataFrameProcessor; -} - const DataFrameProcessorMapper: DataFrameProcessorMapping = { [RType.ExpressionList]: processDataFrameNothing, [RType.FunctionCall]: processDataFrameOperation, @@ -51,15 +35,27 @@ const DataFrameProcessorMapper: DataFrameProcessorMapping = { [RType.Parameter]: processDataFrameNothing }; -export function processDataFrameLeaf( - node: RSingleNode, +type ROperation = RFunctionCall | 
RUnaryOp | RBinaryOp | RAccess; +type RComplexNode = Exclude, RSingleNode>; + +type DataFrameProcessorMapping = { + [Node in RComplexNode as Node['type']]: DataFrameProcessor; +} + +type DataFrameProcessor> = ( + node: Node, + domain: DataFrameStateDomain, + dfg: DataflowGraph +) => DataFrameStateDomain; + +export function processDataFrameLeaf( + node: RSingleNode, domain: DataFrameStateDomain, dfg: DataflowGraph ): DataFrameStateDomain { if(node.type === RType.Symbol) { resolveIdToAbstractValue(node.info.id, domain, dfg); } - updateDomainOfId(node, domain, dfg); return domain; } @@ -71,19 +67,18 @@ export function processDataFrameExpression; - const result = processor(node, domain, dfg); - updateDomainOfId(node, result, dfg); - - return result; + return processor(node, domain, dfg); } -function processDataFrameOperation( - node: ROperation, +function processDataFrameOperation( + node: ROperation, domain: DataFrameStateDomain, dfg: DataflowGraph ): DataFrameStateDomain { - const origin = DefaultBuiltinConfig.find(entry => entry.names.includes(node.lexeme)); - const processor = origin?.type === 'function' ? origin.processor as BuiltInMappingName : 'builtin:default'; + const linked = dfg.getLinked(node.info.id); + const vertex = linked?.[0] !== undefined ? dfg.getVertex(linked[0]) : undefined; + const origin = vertex?.tag === VertexType.FunctionCall && Array.isArray(vertex.origin) ? vertex.origin : undefined; + const processor = origin?.[0] !== undefined && origin[0] in BuiltInProcessorMapper ? origin[0] as BuiltInMappingName : 'builtin:default'; node.info.dataFrame = mapDataFrameSemantics(node, dfg, processor); if(node.info.dataFrame?.type === 'assignment') { @@ -101,23 +96,32 @@ function processDataFrameOperation( const operandValue = operation.operand ? resolveIdToAbstractValue(operation.operand, domain, dfg) : value; value = applySemantics(operation.operation, operandValue ?? 
DataFrameTop, operation.args); - if(operation.operand !== undefined && getConstraintTypes(operation.operation).some(type => type === ConstraintType.OperandPrecondition || type === ConstraintType.OperandModification)) { + if(operation.operand !== undefined && getConstraintType(operation.operation) === ConstraintType.OperandModification) { assignAbstractValueToId(operation.operand, value, domain, dfg); } } - if(node.info.dataFrame.operations.some(operation => getConstraintTypes(operation.operation).includes(ConstraintType.ResultPostcondition))) { + if(node.info.dataFrame.operations.some(operation => getConstraintType(operation.operation) === ConstraintType.ResultPostcondition)) { domain.set(node.info.id, value); } + } else if(processor === 'builtin:pipe') { + return processDataFramePipe(node, domain, dfg); } return domain; } -function processDataFramePipe( - node: RPipe, +function processDataFramePipe( + node: RPipe | ROperation, domain: DataFrameStateDomain, dfg: DataflowGraph ): DataFrameStateDomain { - const value = resolveIdToAbstractValue(node.rhs.info.id, domain, dfg); + let rhs: RNode | undefined; + + if(node.type === RType.Pipe || node.type === RType.BinaryOp) { + rhs = node.rhs; + } else if(node.type === RType.FunctionCall && node.arguments[1] !== EmptyArgument) { + rhs = node.arguments[1]?.value; + } + const value = rhs ? 
resolveIdToAbstractValue(rhs.info.id, domain, dfg) : undefined; if(value !== undefined) { domain.set(node.info.id, value); @@ -125,8 +129,8 @@ function processDataFramePipe( return domain; } -function processDataFrameArgument( - node: RArgument, +function processDataFrameArgument( + node: RArgument, domain: DataFrameStateDomain, dfg: DataflowGraph ): DataFrameStateDomain { @@ -140,8 +144,8 @@ function processDataFrameArgument( return domain; } -function processDataFrameIfThenElse( - node: RIfThenElse, +function processDataFrameIfThenElse( + node: RIfThenElse, domain: DataFrameStateDomain, dfg: DataflowGraph ): DataFrameStateDomain { @@ -157,28 +161,31 @@ function processDataFrameIfThenElse( return domain; } -function processDataFrameNothing( - node: RComplexNode, +function processDataFrameNothing( + node: RComplexNode, domain: DataFrameStateDomain ): DataFrameStateDomain { return domain; } function assignAbstractValueToId(id: NodeId, value: DataFrameDomain, domain: DataFrameStateDomain, dfg: DataflowGraph): void { - dfg.outgoingEdges(id)?.entries() - .filter(([, edge]) => edge.types === EdgeType.Reads) + domain.set(id, value); + getDefinitionOrigin(id, dfg).forEach(origin => domain.set(origin, value)); +} + +function getDefinitionOrigin(id: NodeId, dfg: DataflowGraph): NodeId[] { + return dfg.outgoingEdges(id)?.entries() + .filter(([, { types }]) => edgeIncludesType(types, EdgeType.Reads | EdgeType.DefinedByOnCall) && edgeDoesNotIncludeType(types, EdgeType.NonStandardEvaluation)) .map(([id]) => id) - .forEach(origin => domain.set(origin, value)); + .filter(id => dfg.getVertex(id)?.tag === VertexType.VariableDefinition) + .toArray() ?? 
[]; } function resolveIdToAbstractValue(id: NodeId, domain: DataFrameStateDomain, dfg: DataflowGraph): DataFrameDomain | undefined { if(domain.has(id)) { return domain.get(id); } - const origins = dfg.outgoingEdges(id)?.entries() - .filter(([, edge]) => edge.types === EdgeType.Reads) - .map(([id]) => domain.get(id)) - .toArray(); + const origins = getDefinitionOrigin(id, dfg).map(id => domain.get(id)); if(origins !== undefined && origins.length > 0 && origins.some(origin => origin !== undefined)) { const result = joinDataFrames(...origins.map(origin => origin ?? DataFrameTop)); @@ -188,11 +195,11 @@ function resolveIdToAbstractValue(id: NodeId, domain: DataFrameStateDomain, dfg: } } -function updateDomainOfId(id: NodeId | RNode, domain: DataFrameStateDomain, dfg: DataflowGraph): void { - const node: RNode | undefined = typeof id === 'object' ? id : dfg.idMap?.get(id); +function updateDomainOfId(id: NodeId, domain: DataFrameStateDomain, dfg: DataflowGraph): void { + const node: RNode | undefined = dfg.idMap?.get(id); if(node !== undefined) { - node.info.dataFrame ??= { type: 'other' }; + node.info.dataFrame ??= {}; node.info.dataFrame.domain = new Map(domain); } } diff --git a/src/abstract-interpretation/data-frame/resolve-args.ts b/src/abstract-interpretation/data-frame/resolve-args.ts index 85dbff8fef..49d04369bb 100644 --- a/src/abstract-interpretation/data-frame/resolve-args.ts +++ b/src/abstract-interpretation/data-frame/resolve-args.ts @@ -5,6 +5,7 @@ import type { ParentInformation } from '../../r-bridge/lang-4.x/ast/model/proces import type { NodeId } from '../../r-bridge/lang-4.x/ast/model/processing/node-id'; import { RType } from '../../r-bridge/lang-4.x/ast/model/type'; import { unwrapRValue, unwrapRValueToString, unwrapRVector } from '../../util/r-value'; +import { startAndEndsWith } from '../../util/strings'; /** * Returns the argument name of a function argument @@ -12,7 +13,7 @@ import { unwrapRValue, unwrapRValueToString, unwrapRVector } from 
'../../util/r- export function resolveIdToArgName(id: NodeId | RArgument, info: ResolveInfo): string | undefined { const node = resolveIdToArgument(id, info); - return node?.name?.content; + return unescapeArgument(node?.name?.content); } /** @@ -65,7 +66,7 @@ export function resolveIdToArgValueSymbolName(id: NodeId | RArgument, { graph, } return undefined; } + +export function unescapeArgument(argument: undefined): undefined; +export function unescapeArgument(argument: string): string; +export function unescapeArgument(argument: string | undefined): string | undefined; +export function unescapeArgument(argument: string | undefined): string | undefined { + if(argument === undefined) { + return undefined; + } else if(startAndEndsWith(argument, '`') || startAndEndsWith(argument, '"') || startAndEndsWith(argument, '\'')) { + return argument.slice(1, -1); + } + return argument; +} diff --git a/src/abstract-interpretation/data-frame/semantics-mapper.ts b/src/abstract-interpretation/data-frame/semantics-mapper.ts index 6fdc648e80..9a1880c024 100644 --- a/src/abstract-interpretation/data-frame/semantics-mapper.ts +++ b/src/abstract-interpretation/data-frame/semantics-mapper.ts @@ -7,20 +7,22 @@ import type { DataFrameInfo } from './absint-info'; import { mapDataFrameAccess } from './mappers/access-mapper'; import { mapDataFrameAssignment } from './mappers/assignment-mapper'; import { mapDataFrameFunctionCall } from './mappers/function-mapper'; +import { mapDataFrameReplacement } from './mappers/replacement-mapper'; const DataFrameProcessorMapper = { - 'builtin:default': mapDataFrameFunctionCall, - 'builtin:assignment': mapDataFrameAssignment, - 'builtin:access': mapDataFrameAccess, + 'builtin:default': mapDataFrameFunctionCall, + 'builtin:assignment': mapDataFrameAssignment, + 'builtin:replacement': mapDataFrameReplacement, + 'builtin:access': mapDataFrameAccess, } as const satisfies Partial>; -type DataFrameProcessor = ( - node: RNode, +type DataFrameProcessor = ( + 
node: RNode, dfg: DataflowGraph ) => DataFrameInfo | undefined; -export function mapDataFrameSemantics( - node: RNode, +export function mapDataFrameSemantics( + node: RNode, dfg: DataflowGraph, origin: BuiltInMappingName ): DataFrameInfo | undefined { @@ -31,8 +33,8 @@ export function mapDataFrameSemantics( } } -export function isStringBasedAccess( - access: RAccess -): access is RNamedAccess { +export function isStringBasedAccess( + access: RAccess +): access is RNamedAccess { return access.operator === '$' || access.operator === '@'; } diff --git a/src/abstract-interpretation/data-frame/semantics.ts b/src/abstract-interpretation/data-frame/semantics.ts index 68af248f7a..8d7c8d3c0d 100644 --- a/src/abstract-interpretation/data-frame/semantics.ts +++ b/src/abstract-interpretation/data-frame/semantics.ts @@ -1,5 +1,5 @@ import type { DataFrameDomain } from './domain'; -import { addInterval, ColNamesTop, DataFrameTop, includeZeroInterval, IntervalTop, joinColNames, joinInterval, subtractColNames, subtractInterval } from './domain'; +import { addInterval, ColNamesTop, DataFrameTop, includeInfinityInterval, includeZeroInterval, IntervalBottom, IntervalTop, joinColNames, maxInterval, meetColNames, minInterval, subtractColNames, subtractInterval } from './domain'; export enum ConstraintType { /** The inferred constraints must hold for the operand at the point of the operation */ @@ -10,32 +10,40 @@ export enum ConstraintType { ResultPostcondition } -type DataFrameSemanticsApplier = ( - value: DataFrameDomain, - args: Arguments -) => DataFrameDomain; +const DataFrameSemanticsMapper = { + 'create': { apply: applyCreateSemantics, type: ConstraintType.ResultPostcondition }, + 'unknownCreate': { apply: applyUnknownSemantics, type: ConstraintType.ResultPostcondition }, + 'accessCols': { apply: applyAccessColsSemantics, type: ConstraintType.OperandPrecondition }, + 'accessRows': { apply: applyAccessRowsSemantics, type: ConstraintType.OperandPrecondition }, + 'assignCols': { 
apply: applyAssignColsSemantics, type: ConstraintType.OperandModification }, + 'assignRows': { apply: applyAssignRowsSemantics, type: ConstraintType.OperandModification }, + 'setColNames': { apply: applySetColNamesSemantics, type: ConstraintType.OperandModification }, + 'unknownModify': { apply: applyUnknownSemantics, type: ConstraintType.OperandModification }, + 'addCols': { apply: applyAddColsSemantics, type: ConstraintType.ResultPostcondition }, + 'addRows': { apply: applyAddRowsSemantics, type: ConstraintType.ResultPostcondition }, + 'removeCols': { apply: applyRemoveColsSemantics, type: ConstraintType.ResultPostcondition }, + 'removeRows': { apply: applyRemoveRowsSemantics, type: ConstraintType.ResultPostcondition }, + 'concatCols': { apply: applyConcatColsSemantics, type: ConstraintType.ResultPostcondition }, + 'concatRows': { apply: applyConcatRowsSemantics, type: ConstraintType.ResultPostcondition }, + 'subsetCols': { apply: applySubsetColsSemantics, type: ConstraintType.ResultPostcondition }, + 'subsetRows': { apply: applySubsetRowsSemantics, type: ConstraintType.ResultPostcondition }, + 'filterRows': { apply: applyFilterRowsSemantics, type: ConstraintType.ResultPostcondition }, + 'mutateCols': { apply: applyMutateColsSemantics, type: ConstraintType.ResultPostcondition }, + 'groupBy': { apply: applyGroupBySemantics, type: ConstraintType.ResultPostcondition }, + 'summarize': { apply: applySummarizeSemantics, type: ConstraintType.ResultPostcondition }, + 'leftJoin': { apply: applyLeftJoinSemantics, type: ConstraintType.ResultPostcondition }, + 'identity': { apply: applyIdentitySemantics, type: ConstraintType.ResultPostcondition } +} as const satisfies Record>; type DataFrameSemanticsMapperInfo = { readonly apply: DataFrameSemanticsApplier, - readonly types: ConstraintType[] + readonly type: ConstraintType } -const DataFrameSemanticsMapper = { - 'create': { apply: applyCreateSemantics, types: [ConstraintType.ResultPostcondition] }, - 'accessCol': { apply: 
applyAccessColSemantics, types: [ConstraintType.OperandPrecondition] }, - 'accessRow': { apply: applyAccessRowSemantics, types: [ConstraintType.OperandPrecondition] }, - 'assignCol': { apply: applyAssignColSemantics, types: [ConstraintType.OperandModification] }, - 'assignRow': { apply: applyAssignRowSemantics, types: [ConstraintType.OperandModification] }, - 'setColNames': { apply: applySetColNamesSemantics, types: [ConstraintType.OperandModification] }, - 'addCols': { apply: applyAddColsSemantics, types: [ConstraintType.ResultPostcondition] }, - 'addRows': { apply: applyAddRowsSemantics, types: [ConstraintType.ResultPostcondition] }, - 'removeCols': { apply: applyRemoveColsSemantics, types: [ConstraintType.ResultPostcondition] }, - 'removeRows': { apply: applyRemoveRowsSemantics, types: [ConstraintType.ResultPostcondition] }, - 'concatCols': { apply: applyConcatColsSemantics, types: [ConstraintType.ResultPostcondition] }, - 'concatRows': { apply: applyConcatRowsSemantics, types: [ConstraintType.ResultPostcondition] }, - 'identity': { apply: applyIdentitySemantics, types: [ConstraintType.ResultPostcondition] }, - 'unknown': { apply: applyUnknownSemantics, types: [ConstraintType.ResultPostcondition] } -} as const satisfies Record>; +type DataFrameSemanticsApplier = ( + value: DataFrameDomain, + args: Arguments +) => DataFrameDomain; export type DataFrameOperationName = keyof typeof DataFrameSemanticsMapper; export type DataFrameOperationArgs = Parameters[1]; @@ -50,8 +58,8 @@ export function applySemantics( return applier.apply(value, args); } -export function getConstraintTypes(operation: DataFrameOperationName): ConstraintType[] { - return DataFrameSemanticsMapper[operation].types; +export function getConstraintType(operation: DataFrameOperationName): ConstraintType { + return DataFrameSemanticsMapper[operation].type; } function applyCreateSemantics( @@ -65,7 +73,7 @@ function applyCreateSemantics( }; } -function applyAccessColSemantics( +function 
applyAccessColsSemantics( value: DataFrameDomain, { columns }: { columns: string[] | number[] | undefined } ): DataFrameDomain { @@ -77,60 +85,62 @@ function applyAccessColSemantics( } else if(columns?.every(col => typeof col === 'number')) { return { ...value, - cols: columns.reduce((a, b) => joinInterval(a, [b, b]), value.cols) + cols: columns.reduce((a, b) => maxInterval(a, [b, b]), value.cols) }; } return value; } -function applyAccessRowSemantics( +function applyAccessRowsSemantics( value: DataFrameDomain, { rows }: { rows: number[] | undefined } ): DataFrameDomain { if(rows !== undefined) { return { ...value, - rows: rows.reduce((a, b) => joinInterval(a, [b, b]), value.rows) + rows: rows.reduce((a, b) => maxInterval(a, [b, b]), value.rows) }; } return value; } -function applyAssignColSemantics( +function applyAssignColsSemantics( value: DataFrameDomain, { columns }: { columns: string[] | number[] | undefined } ): DataFrameDomain { if(columns?.every(col => typeof col === 'string')) { return { ...value, - colnames: joinColNames(value.colnames, columns) + colnames: joinColNames(value.colnames, columns), + cols: maxInterval(addInterval(value.cols, [0, columns.length]), [columns.length, columns.length]) }; } else if(columns?.every(col => typeof col === 'number')) { return { ...value, - cols: columns.reduce((a, b) => joinInterval(a, [b, b]), value.cols) + colnames: ColNamesTop, + cols: columns.reduce((a, b) => maxInterval(a, [b, b]), value.cols) }; } return { ...value, colnames: ColNamesTop, - cols: IntervalTop + cols: includeInfinityInterval(value.cols) }; } -function applyAssignRowSemantics( +function applyAssignRowsSemantics( value: DataFrameDomain, { rows }: { rows: number[] | undefined } ): DataFrameDomain { if(rows !== undefined) { return { ...value, - rows: rows.reduce((a, b) => joinInterval(a, [b, b]), value.rows) + rows: rows.reduce((a, b) => maxInterval(a, [b, b]), value.rows) }; } return { ...value, - rows: IntervalTop + rows: 
includeInfinityInterval(value.rows) }; } @@ -141,7 +151,6 @@ function applySetColNamesSemantics( return { ...value, colnames: colnames?.every(name => name !== undefined) ? colnames : ColNamesTop, - cols: colnames !== undefined ? [colnames.length, colnames.length] : IntervalTop }; } @@ -208,6 +217,82 @@ function applyConcatRowsSemantics( }; } +function applySubsetColsSemantics( + value: DataFrameDomain, + { colnames }: { colnames: (string | undefined)[] | undefined } +): DataFrameDomain { + return { + ...value, + colnames: colnames?.every(col => col !== undefined) ? meetColNames(value.colnames, colnames) : value.colnames, + cols: colnames !== undefined ? minInterval(value.cols, [colnames.length, colnames.length]) : value.cols + }; +} + +function applySubsetRowsSemantics( + value: DataFrameDomain, + { rows }: { rows: number | undefined } +): DataFrameDomain { + return { + ...value, + rows: rows !== undefined ? minInterval(value.rows, [rows, rows]) : value.rows + }; +} + +function applyFilterRowsSemantics( + value: DataFrameDomain, + { condition }: { condition: boolean | undefined } +): DataFrameDomain { + return { + ...value, + rows: condition ? value.rows : condition === false ? [0, 0] : includeZeroInterval(value.rows) + }; +} + +function applyMutateColsSemantics( + value: DataFrameDomain, + { colnames }: { colnames: (string | undefined)[] | undefined } +): DataFrameDomain { + return { + ...value, + colnames: colnames?.every(col => col !== undefined) ? joinColNames(value.colnames, colnames) : ColNamesTop, + cols: colnames !== undefined ? 
maxInterval(addInterval(value.cols, [0, colnames.length]), [colnames.length, colnames.length]): includeInfinityInterval(value.rows) + }; +} + +function applyGroupBySemantics( + value: DataFrameDomain, + _args: { by: string | undefined } +): DataFrameDomain { + return { + ...value, + rows: includeZeroInterval(value.rows) + }; +} + +function applySummarizeSemantics( + value: DataFrameDomain, + { colnames }: { colnames: (string | undefined)[] | undefined } +): DataFrameDomain { + return { + ...value, + colnames: colnames?.every(col => col !== undefined) ? joinColNames(value.colnames, colnames) : ColNamesTop, + cols: colnames !== undefined ? minInterval(addInterval(value.cols, [0, colnames.length]), [colnames.length, Infinity]) : includeInfinityInterval(value.rows), + rows: value.rows !== IntervalBottom && value.rows[0] > 0 ? [1, 1] : value.rows + }; +} + +function applyLeftJoinSemantics( + value: DataFrameDomain, + { other, minRows }: { other: DataFrameDomain, by: string | undefined, minRows?: boolean } +): DataFrameDomain { + return { + ...value, + colnames: joinColNames(value.colnames, other.colnames), + cols: subtractInterval(addInterval(value.cols, other.cols), [1, 1]), + rows: minRows ? 
minInterval(value.rows, other.rows) : value.rows + }; +} + function applyIdentitySemantics( value: DataFrameDomain, _args: Record @@ -217,7 +302,7 @@ function applyIdentitySemantics( function applyUnknownSemantics( _value: DataFrameDomain, - _args: { creation?: boolean, modifyInplace?: boolean } + _args: Record ): DataFrameDomain { return DataFrameTop; } diff --git a/src/dataflow/environments/resolve-by-name.ts b/src/dataflow/environments/resolve-by-name.ts index 41e35243be..265e9da806 100644 --- a/src/dataflow/environments/resolve-by-name.ts +++ b/src/dataflow/environments/resolve-by-name.ts @@ -348,6 +348,18 @@ export function resolveIdToValue(id: NodeId | RNodeWithParent, { environment, gr case RType.Number: case RType.Logical: return [node.content]; + case RType.UnaryOp: + if(full && node.operator === '-') { + const arg = resolveIdToValue(node.operand, { environment, graph, idMap, full }); + const argValue = arg?.length === 1 ? arg[0] : undefined; + + if(isRNumberValue(argValue)) { + return [{ ...argValue, num: -argValue.num }]; + } else if(Array.isArray(argValue) && argValue.every(isRNumberValue)) { + return [argValue.map(element => ({ ...element, num: -element.num }))]; + } + } + return undefined; case RType.BinaryOp: if(full && node.operator === ':' && (node.lhs.type === RType.Number || node.lhs.type === RType.Symbol) && (node.rhs.type === RType.Symbol || node.rhs.type === RType.Number)) { const leftArg = resolveIdToValue(node.lhs.info.id, { environment, graph, idMap, full }); diff --git a/src/util/cfg/cfg.ts b/src/util/cfg/cfg.ts index d5787e09f4..4080c26b2a 100644 --- a/src/util/cfg/cfg.ts +++ b/src/util/cfg/cfg.ts @@ -305,10 +305,6 @@ function cfgWhile(whileLoop: RWhileLoop, condition: ControlFl } } - for(const entryPoint of body.entryPoints) { - graph.addEdge(whileLoop.info.id, entryPoint, { label: 'FD' }); - } - for(const next of [...body.nexts, ...body.exitPoints]) { graph.addEdge(whileLoop.info.id, next, { label: 'FD' }); } diff --git 
a/test/functionality/abstract-interpretation/data-frame/data-frame.ts b/test/functionality/abstract-interpretation/data-frame/data-frame.ts index 9bbe0188b4..5e00c4ac2d 100644 --- a/test/functionality/abstract-interpretation/data-frame/data-frame.ts +++ b/test/functionality/abstract-interpretation/data-frame/data-frame.ts @@ -2,7 +2,7 @@ import { assert, beforeAll, test } from 'vitest'; import type { AbstractInterpretationInfo } from '../../../../src/abstract-interpretation/data-frame/absint-info'; import { performDataFrameAbsint } from '../../../../src/abstract-interpretation/data-frame/abstract-interpretation'; import type { DataFrameDomain } from '../../../../src/abstract-interpretation/data-frame/domain'; -import { DataFrameTop, leqColNames, leqInterval } from '../../../../src/abstract-interpretation/data-frame/domain'; +import { DataFrameTop, equalColNames, equalInterval, leqColNames, leqInterval } from '../../../../src/abstract-interpretation/data-frame/domain'; import { PipelineExecutor } from '../../../../src/core/pipeline-executor'; import type { TREE_SITTER_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines'; import { createDataflowPipeline, DEFAULT_DATAFLOW_PIPELINE } from '../../../../src/core/steps/pipeline/default-pipelines'; @@ -13,7 +13,7 @@ import type { ParentInformation } from '../../../../src/r-bridge/lang-4.x/ast/mo import { RType } from '../../../../src/r-bridge/lang-4.x/ast/model/type'; import type { KnownParser } from '../../../../src/r-bridge/parser'; import { requestFromInput } from '../../../../src/r-bridge/retriever'; -import type { RShell } from '../../../../src/r-bridge/shell'; +import { type RShell } from '../../../../src/r-bridge/shell'; import type { SingleSlicingCriterion } from '../../../../src/slicing/criterion/parse'; import { slicingCriterionToId } from '../../../../src/slicing/criterion/parse'; import { assertUnreachable, guard, isNotUndefined } from '../../../../src/util/assert'; @@ -40,6 +40,22 
@@ export const DataFrameTestOverapproximation = { rows: DomainMatchingType.Overapproximation }; +type DomainPredicateMapping = { + [K in keyof DataFrameDomain]: (X1: DataFrameDomain[K], X2: DataFrameDomain[K]) => boolean +} + +const EqualFunctions: DomainPredicateMapping = { + colnames: equalColNames, + cols: equalInterval, + rows: equalInterval +}; + +const LeqFunctions: DomainPredicateMapping = { + colnames: leqColNames, + cols: leqInterval, + rows: leqInterval +}; + /** Stores the inferred data frame constraints and AST node for a tested slicing criterion */ interface CriterionTestEntry { criterion: SingleSlicingCriterion, @@ -65,9 +81,9 @@ export function assertDataFrameDomain( guard(isNotUndefined(result), 'Result cannot be undefined'); const [value] = getInferredDomainForCriterion(result, criterion); - assert.deepStrictEqual(value.colnames, expect.colnames, 'column names differ'); - assert.deepStrictEqual(value.cols, expect.cols, 'column count differs'); - assert.deepStrictEqual(value.rows, expect.rows, 'row count differs'); + assertDomainMatching('colnames', value.colnames, expect.colnames, DomainMatchingType.Exact); + assertDomainMatching('cols', value.cols, expect.cols, DomainMatchingType.Exact); + assertDomainMatching('rows', value.rows, expect.rows, DomainMatchingType.Exact); }); } @@ -118,9 +134,9 @@ export function testDataFrameDomainAgainstReal( const cols = getRealDomainFromOutput('cols', criterion, output); const rows = getRealDomainFromOutput('rows', criterion, output); - assertDomainMatching('colnames', value.colnames, colnames, leqColNames, options.colnames); - assertDomainMatching('cols', value.cols, cols, leqInterval, options.cols); - assertDomainMatching('rows', value.rows, rows, leqInterval, options.rows); + assertDomainMatching('colnames', value.colnames, colnames, options.colnames); + assertDomainMatching('cols', value.cols, cols, options.cols); + assertDomainMatching('rows', value.rows, rows, options.rows); } }); } @@ -129,14 +145,16 @@ 
function assertDomainMatching boolean, matchingType: DomainMatchingType ): void { + const equalFunction = EqualFunctions[type]; + const leqFunction = LeqFunctions[type]; + switch(matchingType) { case DomainMatchingType.Exact: - return assert.deepStrictEqual(actual, expected, `${type} differs`); + return assert.ok(equalFunction(actual, expected), `${type} differs: expected ${JSON.stringify(actual)} to equal ${JSON.stringify(expected)}`); case DomainMatchingType.Overapproximation: - return assert.isTrue(leqFunction(expected, actual), `${type} is no over-approximation`); + /* the check passes iff leqFunction(expected, actual) holds, i.e. actual is the over-approximation of expected; the message names them in that order */ return assert.ok(leqFunction(expected, actual), `${type} is no over-approximation: expected ${JSON.stringify(actual)} to be an over-approximation of ${JSON.stringify(expected)}`); default: assertUnreachable(matchingType); } diff --git a/test/functionality/abstract-interpretation/data-frame/inference.test.ts b/test/functionality/abstract-interpretation/data-frame/inference.test.ts index 7fd283aaf0..cf559c137f 100644 --- a/test/functionality/abstract-interpretation/data-frame/inference.test.ts +++ b/test/functionality/abstract-interpretation/data-frame/inference.test.ts @@ -1,12 +1,15 @@ -import { describe } from 'vitest'; +import { afterAll, beforeAll, describe } from 'vitest'; import type { DataFrameDomain } from '../../../../src/abstract-interpretation/data-frame/domain'; import { ColNamesTop, DataFrameTop, IntervalTop } from '../../../../src/abstract-interpretation/data-frame/domain'; +import { amendConfig, defaultConfigOptions } from '../../../../src/config'; import type { SingleSlicingCriterion } from '../../../../src/slicing/criterion/parse'; import { withShell } from '../../_helper/shell'; import type { DataFrameTestOptions } from './data-frame'; import { assertDataFrameDomain, DataFrameTestOverapproximation, DomainMatchingType, testDataFrameDomainAgainstReal } from './data-frame'; describe.sequential('Data Frame Abstract Interpretation', withShell(shell => { + const skipDplyr = true; +
function testDataFrameDomain( code: string, criteria: ([SingleSlicingCriterion, DataFrameDomain] | [SingleSlicingCriterion, DataFrameDomain, Partial])[] @@ -15,6 +18,14 @@ describe.sequential('Data Frame Abstract Interpretation', withShell(shell => { testDataFrameDomainAgainstReal(shell, code, criteria.map(entry => entry.length === 3 ? [entry[0], entry[2]] : entry[0])); } + beforeAll(() => { + amendConfig({ solver: { pointerTracking: false } }); + }); + + afterAll(() => { + amendConfig({ solver: { pointerTracking: defaultConfigOptions.solver.pointerTracking } }); + }); + testDataFrameDomain( 'df <- data.frame(id = 1:5, age = c(25, 32, 35, 40, 45), score = c(90, 85, 88, 92, 95), row.names = NULL)', [['1@df', { colnames: ['id', 'age', 'score'], cols: [3, 3], rows: [5, 5] }]] @@ -47,6 +58,17 @@ df2 <- df1 ] ); + testDataFrameDomain( + ` +df1 <- data.frame(id = 1:3, label = c("A", "B", "C")) +df2 <- as.data.frame(df1) + `.trim(), + [ + ['1@df1', { colnames: ['id', 'label'], cols: [2, 2], rows: [3, 3] }], + ['2@df2', { colnames: ['id', 'label'], cols: [2, 2], rows: [3, 3] }] + ] + ); + testDataFrameDomain( 'df <- read.csv(text = "id,age\\n1,30\\n2,50\\n3,45")', [['1@df', DataFrameTop, DataFrameTestOverapproximation]] @@ -113,54 +135,278 @@ print(df) testDataFrameDomain( ` -df <- data.frame(id = 1:5) -for (i in 1:5) { - df[["name"]] +i <- 5 +df <- if (i == 0) { + data.frame(id = 1:3) +} else if (i == 1) { + data.frame(id = 1:5) +} else if (i == 2) { + data.frame(name = 1:10) +} else { + data.frame(id = 1, name = 1:5) } -df[10, ] print(df) `.trim(), - [['6@df', { colnames: ['id', 'name'], cols: [1, 1], rows: [5, 10] }, { colnames: DomainMatchingType.Overapproximation, rows: DomainMatchingType.Overapproximation }]] + [['11@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 10] }, DataFrameTestOverapproximation]] ); testDataFrameDomain( ` df <- data.frame(id = 1:5) for (i in 1:5) { - break - df[["name"]] + df[2] <- 6:10 } -df[10, ] +df[10, ] <- c(6, 11) print(df) 
`.trim(), - [['7@df', { colnames: ['id'], cols: [1, 1], rows: [5, 10] }, { colnames: DomainMatchingType.Overapproximation, rows: DomainMatchingType.Overapproximation }]] + [['6@df', { colnames: ColNamesTop, cols: [1, 2], rows: [10, 10] }, { colnames: DomainMatchingType.Overapproximation, cols: DomainMatchingType.Overapproximation }]] ); testDataFrameDomain( ` df <- data.frame(id = 1:5) while (TRUE) { - df[["name"]] + df[2] <- 6:10 break } -df[10, ] +df[10, ] <- c(6, 11) print(df) - `.trim(), - [['7@df', { colnames: ['id', 'name'], cols: [1, 1], rows: [5, 10] }, DataFrameTestOverapproximation]] + `.trim(), + [['7@df', { colnames: ColNamesTop, cols: [1, 2], rows: [10, 10] }, { colnames: DomainMatchingType.Overapproximation, cols: DomainMatchingType.Overapproximation }]] ); assertDataFrameDomain( shell, ` df <- data.frame(id = 1:5) repeat { - df[["name"]] + df[2] <- 6:10 } -df[10, ] +df[10, ] <- c(6, 11) print(df) - `.trim(), + `.trim(), [['6@df', DataFrameTop]] ); + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df["id"] + `.trim(), + [['2@result', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }]] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df[1] + `.trim(), + [['2@result', { colnames: ['id', 'name'], cols: [1, 1], rows: [3, 3] }, { colnames: DomainMatchingType.Overapproximation }]] + ); + + assertDataFrameDomain( + shell, ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df[1, 1] + `.trim(), + [['2@result', DataFrameTop]] + ); + + assertDataFrameDomain( + shell, ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df[, 1] + `.trim(), + [['2@result', DataFrameTop,]] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df[1, ] + `.trim(), + [['2@result', { colnames: ['id', 'name'], cols: [2, 2], rows: [1, 1] }]] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df[1, c(1, 2)] + `.trim(), + [['2@result', { colnames: ['id', 
'name'], cols: [2, 2], rows: [1, 1] }]] + ); + + assertDataFrameDomain( + shell, ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df[c(1, 2), 1] + `.trim(), + [['2@result', DataFrameTop]] + ); + + assertDataFrameDomain( + shell, ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df[["id"]] + `.trim(), + [['2@result', DataFrameTop]] + ); + + assertDataFrameDomain( + shell, ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df[[1]] + `.trim(), + [['2@result', DataFrameTop]] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df[-1, "id", drop = FALSE] + `.trim(), + [['2@result', { colnames: ['id'], cols: [1, 1], rows: [2, 2] }]] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df[c(-1, -2), -1, drop = FALSE] + `.trim(), + [['2@result', { colnames: ['id', 'name'], cols: [1, 1], rows: [1, 1] }, { colnames: DomainMatchingType.Overapproximation }]] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6) +result <- df[,] + `.trim(), + [ + ['1@df', { colnames: ['id','name'], cols: [2, 2], rows: [3, 3] }], + ['2@result', { colnames: ['id','name'], cols: [2, 2], rows: [3, 3] }], + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3) +df$id <- "A" +print(df) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }], + ['2@df', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }], + ['3@df', { colnames: ['id'], cols: [1, 2], rows: [3, 3] }, { cols: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3) +df$name <- "A" +print(df) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }], + ['3@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 3] }, { cols: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3) +df[["name"]] <- "A" +print(df) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }], + ['3@df', { 
colnames: ['id', 'name'], cols: [1, 2], rows: [3, 3] }, { cols: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3) +df[1] <- "A" +print(df) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }], + ['3@df', { colnames: ColNamesTop, cols: [1, 1], rows: [3, 3] }, { colnames: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3) +df[2] <- "A" +print(df) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }], + ['3@df', { colnames: ColNamesTop, cols: [2, 2], rows: [3, 3] }, { colnames: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3) +df[, "name"] <- "A" +print(df) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }], + ['3@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 3] }, { cols: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3) +df[4, ] <- 4 +print(df) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }], + ['3@df', { colnames: ['id'], cols: [1, 1], rows: [4, 4] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3) +df[4, 1] <- 4 +print(df) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }], + ['3@df', { colnames: ColNamesTop, cols: [1, 1], rows: [4, 4] }, { colnames: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(1:5, 6:10) +colnames(df) <- c("id", "name") +print(df) + `.trim(), + [ + ['1@df', { colnames: ColNamesTop, cols: [2, 2], rows: [5, 5] }, { colnames: DomainMatchingType.Overapproximation }], + ['3@df', { colnames: ['id', 'name'], cols: [2, 2], rows: [5, 5] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:5, name = 6:10) +colnames(df) <- runif(2) +print(df) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [2, 2], rows: [5, 
5] }], + ['3@df', { colnames: ColNamesTop, cols: [2, 2], rows: [5, 5] }, { colnames: DomainMatchingType.Overapproximation }] + ] + ); + testDataFrameDomain( ` df <- data.frame(id = 1:5) @@ -260,4 +506,428 @@ df <- rbind(df, list(id = 6:10)) ['2@df', { colnames: ['id'], cols: [1, 1], rows: IntervalTop }, { rows: DomainMatchingType.Overapproximation }] ] ); + + testDataFrameDomain( + ` +df <- if (runif(1) >= 0.5) data.frame(id = 1:3) else data.frame(id = 1:5, name = 6:10) +df <- head(df, n = 3) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 5] }, DataFrameTestOverapproximation], + ['2@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 3] }, { colnames: DomainMatchingType.Overapproximation, cols: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- if (runif(1) >= 0.5) data.frame(id = 1:3) else data.frame(id = 1:5, name = 6:10) +df <- head(df, c(2, 1)) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 5] }, DataFrameTestOverapproximation], + ['2@df', { colnames: ['id', 'name'], cols: [1, 1], rows: [2, 2] }, { colnames: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- if (runif(1) >= 0.5) data.frame(id = 1:3) else data.frame(id = 1:5, name = 6:10) +df <- head(df, -2) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 5] }, DataFrameTestOverapproximation], + ['2@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [1, 3] }, DataFrameTestOverapproximation] + ] + ); + + testDataFrameDomain( + ` +df <- if (runif(1) >= 0.5) data.frame(id = 1:3) else data.frame(id = 1:5, name = 6:10) +df <- head(df, n = -c(2, 1)) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 5] }, DataFrameTestOverapproximation], + ['2@df', { colnames: ['id', 'name'], cols: [0, 1], rows: [1, 3] }, DataFrameTestOverapproximation] + ] + ); + + testDataFrameDomain( + ` +df <- if (runif(1) >= 0.5) data.frame(id = 1:3) 
else data.frame(id = 1:5, name = 6:10) +df <- tail(df, n = 3) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 5] }, DataFrameTestOverapproximation], + ['2@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 3] }, { colnames: DomainMatchingType.Overapproximation, cols: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- if (runif(1) >= 0.5) data.frame(id = 1:3) else data.frame(id = 1:5, name = 6:10) +df <- tail(df, c(2, 1)) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 5] }, DataFrameTestOverapproximation], + ['2@df', { colnames: ['id', 'name'], cols: [1, 1], rows: [2, 2] }, { colnames: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- if (runif(1) >= 0.5) data.frame(id = 1:3) else data.frame(id = 1:5, name = 6:10) +df <- tail(df, -2) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 5] }, DataFrameTestOverapproximation], + ['2@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [1, 3] }, DataFrameTestOverapproximation] + ] + ); + + testDataFrameDomain( + ` +df <- if (runif(1) >= 0.5) data.frame(id = 1:3) else data.frame(id = 1:5, name = 6:10) +df <- tail(df, n = -c(2, 1)) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [3, 5] }, DataFrameTestOverapproximation], + ['2@df', { colnames: ['id', 'name'], cols: [0, 1], rows: [1, 3] }, DataFrameTestOverapproximation] + ] + ); + + describe.skipIf(skipDplyr)('dplyr Functions', () => { + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6) +df <- dplyr::filter(df, TRUE) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [2, 2], rows: [3, 3] }], + ['2@df', { colnames: ['id', 'name'], cols: [2, 2], rows: [3, 3] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6) +df <- dplyr::filter(df, FALSE) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [2, 2], rows: [3, 3] }], + 
['2@df', { colnames: ['id', 'name'], cols: [2, 2], rows: [0, 0] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6) +df <- dplyr::filter(df, id == 2) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name'], cols: [2, 2], rows: [3, 3] }], + ['2@df', { colnames: ['id', 'name'], cols: [2, 2], rows: [0, 3] }, { rows: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6, label = "A") +df <- dplyr::select(df, id, name) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name', 'label'], cols: [3, 3], rows: [3, 3] }], + ['2@df', { colnames: ['id', 'name'], cols: [2, 2], rows: [3, 3] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6, label = "A") +df <- dplyr::select(df, -name) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name', 'label'], cols: [3, 3], rows: [3, 3] }], + ['2@df', { colnames: ['id', 'label'], cols: [2, 2], rows: [3, 3] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6, label = "A") +df <- dplyr::select(df, -name, -label) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name', 'label'], cols: [3, 3], rows: [3, 3] }], + ['2@df', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6, label = "A") +df <- dplyr::select(df, id, -name) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name', 'label'], cols: [3, 3], rows: [3, 3] }], + ['2@df', { colnames: ['id'], cols: [1, 1], rows: [3, 3] }] + ] + ); + }); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6, label = "A") +df <- subset(df, TRUE, select = c(id, name)) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name', 'label'], cols: [3, 3], rows: [3, 3] }], + ['2@df', { colnames: ['id', 'name'], cols: [2, 2], rows: [3, 3] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6, label = "A") +df <- subset(df, FALSE, id) + `.trim(), + [ + ['1@df', { colnames: 
['id', 'name', 'label'], cols: [3, 3], rows: [3, 3] }], + ['2@df', { colnames: ['id'], cols: [1, 1], rows: [0, 0] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6, label = "A") +df <- subset(df, id == 2, -label) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name', 'label'], cols: [3, 3], rows: [3, 3] }], + ['2@df', { colnames: ['id', 'name'], cols: [2, 2], rows: [0, 3] }, { rows: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6, label = "A") +df <- subset(df, id > 1, select = c(-name, -label)) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name', 'label'], cols: [3, 3], rows: [3, 3] }], + ['2@df', { colnames: ['id'], cols: [1, 1], rows: [0, 3] }, { rows: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, name = 4:6, label = "A") +df <- subset(df, select = c(-id, -name)) + `.trim(), + [ + ['1@df', { colnames: ['id', 'name', 'label'], cols: [3, 3], rows: [3, 3] }], + ['2@df', { colnames: ['label'], cols: [1, 1], rows: [3, 3] }] + ] + ); + + describe.skipIf(skipDplyr)('dplyr Functions', () => { + testDataFrameDomain( + ` +df <- data.frame(id = 1:5) +df <- dplyr::mutate(df, id = c(letters[1:5])) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['2@df', { colnames: ['id'], cols: [1, 2], rows: [5, 5] }, { cols: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:5) +df <- dplyr::mutate(df, name = c(letters[1:5])) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['2@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [5, 5] }, { cols: DomainMatchingType.Overapproximation }] + ] + ); + }); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:5) +df <- transform(df, id = c(letters[1:5])) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['2@df', { colnames: ['id'], cols: [1, 2], 
rows: [5, 5] }, { cols: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:5) +df <- transform(df, name = c(letters[1:5])) + `.trim(), + [ + ['1@df', { colnames: ['id'], cols: [1, 1], rows: [5, 5] }], + ['2@df', { colnames: ['id', 'name'], cols: [1, 2], rows: [5, 5] }, { cols: DomainMatchingType.Overapproximation }] + ] + ); + + describe.skipIf(skipDplyr)('dplyr Functions', () => { + testDataFrameDomain( + ` +df <- data.frame(id = 1:5, score = c(80, 75, 90, 70, 85)) +df <- dplyr::group_by(df, id) |> as.data.frame() + `.trim(), + [ + ['1@df', { colnames: ['id', 'score'], cols: [2, 2], rows: [5, 5] }], + ['2@df', { colnames: ['id', 'score'], cols: [2, 2], rows: [0, 5] }, { rows: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +library(dplyr) +df <- data.frame(id = 1:6, category = c("A", "B", "B", "A", "C", "B"), score = c(80, 75, 90, 70, 85, 82)) +df <- df |> + group_by(category) |> + summarise(score = mean(score), sum = sum(score)) |> + as.data.frame() +print(df) + `.trim(), + [ + ['2@df', { colnames: ['id', 'category', 'score'], cols: [3, 3], rows: [6, 6] }], + ['7@df', { colnames: ['id', 'category', 'score', 'sum'], cols: [2, 5], rows: [0, 6] }, DataFrameTestOverapproximation] + ] + ); + + testDataFrameDomain( + ` +library(dplyr) +df <- data.frame(id = 1:6, category = c("A", "B", "B", "A", "C", "B"), score = c(80, 75, 90, 70, 85, 82)) +df <- df |> + summarise(score = mean(score), sum = sum(score)) |> + as.data.frame() +print(df) + `.trim(), + [ + ['2@df', { colnames: ['id', 'category', 'score'], cols: [3, 3], rows: [6, 6] }], + ['6@df', { colnames: ['id', 'category', 'score', 'sum'], cols: [2, 5], rows: [1, 1] }, { colnames: DomainMatchingType.Overapproximation, cols: DomainMatchingType.Overapproximation }] + ] + ); + + testDataFrameDomain( + ` +df1 <- data.frame(id = 1:4, score = c(80, 75, 90, 70)) +df2 <- data.frame(id = 1:6, category = c("A", "B", "B", "A", "C", "B")) +df <- 
dplyr::left_join(df1, df2, by = "id") + `.trim(), + [ + ['1@df1', { colnames: ['id', 'score'], cols: [2, 2], rows: [4, 4] }], + ['2@df2', { colnames: ['id', 'category'], cols: [2, 2], rows: [6, 6] }], + ['3@df', { colnames: ['id', 'score', 'category'], cols: [3, 3], rows: [4, 4] }] + ] + ); + + testDataFrameDomain( + ` +df1 <- data.frame(id = 1:6, category = c("A", "B", "B", "A", "C", "B")) +df2 <- data.frame(id = 1:4, score = c(80, 75, 90, 70)) +df <- dplyr::left_join(df1, df2, by = "id") + `.trim(), + [ + ['1@df1', { colnames: ['id', 'category'], cols: [2, 2], rows: [6, 6] }], + ['2@df2', { colnames: ['id', 'score'], cols: [2, 2], rows: [4, 4] }], + ['3@df', { colnames: ['id', 'category', 'score'], cols: [3, 3], rows: [6, 6] }] + ] + ); + }); + + testDataFrameDomain( + ` +df1 <- data.frame(id = 1:4, score = c(80, 75, 90, 70)) +df2 <- data.frame(id = 1:6, category = c("A", "B", "B", "A", "C", "B")) +df <- merge(df1, df2, by = "id") + `.trim(), + [ + ['1@df1', { colnames: ['id', 'score'], cols: [2, 2], rows: [4, 4] }], + ['2@df2', { colnames: ['id', 'category'], cols: [2, 2], rows: [6, 6] }], + ['3@df', { colnames: ['id', 'score', 'category'], cols: [3, 3], rows: [4, 4] }] + ] + ); + + testDataFrameDomain( + ` +df1 <- data.frame(id = 1:6, category = c("A", "B", "B", "A", "C", "B")) +df2 <- data.frame(id = 1:4, score = c(80, 75, 90, 70)) +df <- merge(df1, df2, by = "id") + `.trim(), + [ + ['1@df1', { colnames: ['id', 'category'], cols: [2, 2], rows: [6, 6] }], + ['2@df2', { colnames: ['id', 'score'], cols: [2, 2], rows: [4, 4] }], + ['3@df', { colnames: ['id', 'category', 'score'], cols: [3, 3], rows: [4, 4] }] + ] + ); + + describe.skipIf(skipDplyr)('dplyr Functions', () => { + testDataFrameDomain( + ` +df <- data.frame(id = 1:5, category = c("A", "B", "A", "C", "B"), score = c(80, 75, 90, 70, 85)) +df <- dplyr::relocate(df, score, .before = category) + `.trim(), + [ + ['1@df', { colnames: ['id', 'category', 'score'], cols: [3, 3], rows: [5, 5] }], + ['2@df', { 
colnames: ['id', 'category', 'score'], cols: [3, 3], rows: [5, 5] }] + ] + ); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:5, category = c("A", "B", "A", "C", "B"), score = c(80, 75, 90, 70, 85)) +df <- dplyr::arrange(df, -score, id) + `.trim(), + [ + ['1@df', { colnames: ['id', 'category', 'score'], cols: [3, 3], rows: [5, 5] }], + ['2@df', { colnames: ['id', 'category', 'score'], cols: [3, 3], rows: [5, 5] }] + ] + ); + + testDataFrameDomain( + ` +library(dplyr) + +df1 <- data.frame(id = 1:5, age = c(25, 32, 35, 40, 45), score = c(90, 85, 88, 92, 95)) +df2 <- data.frame(id = c(1, 2, 3, 5, 6, 7), category = c("A", "B", "A", "A", "B", "B")) +df3 <- df1 %>% + filter(age > 30) %>% + mutate(level = score^2) %>% + left_join(df2, by = "id") %>% + select(-age) + +print(df3$level) + `.trim(), + [ + ['3@df1', { colnames: ['id', 'age', 'score'], cols: [3, 3], rows: [5, 5] }], + ['4@df2', { colnames: ['id', 'category'], cols: [2, 2], rows: [6, 6] }], + ['11@df3', { colnames: ['id', 'score', 'level', 'category'], cols: [3, 4], rows: [0, 5] }, { cols: DomainMatchingType.Overapproximation, rows: DomainMatchingType.Overapproximation }] + ] + ); + }); + + testDataFrameDomain( + ` +df <- data.frame(id = 1:3, age = c(25, 30, 40)) +df <- df |> subset(age < 30) +df <- df |> rbind(c(4, 32), c(5, 35)) +df <- df[2:3, 1:2] + `.trim(), + [ + ['1@df', { colnames: ['id', 'age'], cols: [2, 2], rows: [3, 3] }], + ['2@df', { colnames: ['id', 'age'], cols: [2, 2], rows: [0, 3] }, { rows: DomainMatchingType.Overapproximation }], + ['3@df', { colnames: ['id', 'age'], cols: [2, 2], rows: [2, 5] }, { rows: DomainMatchingType.Overapproximation }], + ['4@df', { colnames: ['id', 'age'], cols: [2, 2], rows: [2, 2] }], + ] + ); }));