udfs/community/cw_disjoint_all_partitions_by_regexp.sqlx

config { hasOutput: true }
/*
 * Copyright 2024 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

CREATE OR REPLACE FUNCTION ${self()}(haystack STRING, regex STRING)
  RETURNS ARRAY<STRUCT<p INT64, rn INT64>>
  OPTIONS(description="""Returns disjoint matches for the regex with the sequence of rows encoded in
 special format. The expected input haystack format is
 "row-1@row-number-1#row-2@row-number-2# ... #row-n@row-number-n#".
 If the regex matches then, it returns all the rows matched by the given pattern
 along with the match group to which the given row belongs, e.g. if haystack is
 "A@1#A@2#B@3#A@4#B@5#" and regex is "(?:A@\\d+#)+(?:B@\\d+#)" then it will
 return  [(0, 1), (0, 2), (0, 3), (1, 4), (1, 5)] since there are two matched
 subsequence - "A@1#A@2#B@3#" and "A@4#B@5#" forming two partitions. (1, 2, 3)
 belong to partition 0 and (4, 5) belongs to partition 1.

 Assuming regex "(?:A@\\d+#)+(?:B@\\d+#)", for each of the haystacks below we
 will get corresponding outputs as:
 haystack                output
 "A@1#A@2#B@3#A@4#B@5#"  [(0, 1), (0, 2), (0, 3), (1, 4), (1, 5)]
 "A@1#B@2#B@3#A@4#B@5#"  [(0, 1), (0, 2), (1, 4), (1, 5)]
""")
  AS (
    (WITH t AS (
        SELECT
          p, split(m, '#') ms
        FROM UNNEST(REGEXP_EXTRACT_ALL(haystack, regex)) m WITH OFFSET p
      )
      SELECT
        ARRAY_AGG(STRUCT(p, CAST(split(ms, '@')[SAFE_OFFSET(1)] AS INT64) AS rn))
      FROM t CROSS JOIN UNNEST(t.ms) ms WHERE ms <> '')
  );