regexp_instr(col, pattern, pos, occurrence)
Description
The regexp_instr() function returns the position of the substring that matches a regular expression, starting the search at the specified position and looking for the specified occurrence.
Parameters
col
: Column - the column of strings to search
pattern
: String - the regular expression to search for
pos
: Integer (optional) - the position to start the search from (default 1)
occurrence
: Integer (optional) - the occurrence number to find (default 1)
Return value
Integer - the position of the match, or 0 if no match is found
Usage example
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_instr

# Create a SparkSession
spark = SparkSession.builder.appName("regexp_instr_example").getOrCreate()

# Create a DataFrame; each row must be a one-element tuple,
# so the trailing commas matter
data = [
    ("hello world hello",),
    ("spark is awesome",),
    ("hello hello hello",),
]
df = spark.createDataFrame(data, ["text"])

# Find the position of the second occurrence of the word "hello",
# starting the search at position 1
df.select(
    "text",
    regexp_instr("text", "hello", 1, 2).alias("second_hello_pos")
).show(truncate=False)
# Result:
# +------------------+----------------+
# |text |second_hello_pos|
# +------------------+----------------+
# |hello world hello |13 |
# |spark is awesome |0 |
# |hello hello hello |7 |
# +------------------+----------------+
Notes
- The function is case-sensitive
- NULL input values return NULL
- If no match is found, 0 is returned
- To extract the matching substring itself, use regexp_substr()
- To test whether a string matches a regular expression, use regexp_like() (both are shown in the sketch below)
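A minimal sketch of the two companion functions on the same df, assuming PySpark 3.5+ (where regexp_substr() and regexp_like() are available and the pattern is passed as a Column expression via lit()); the (?i) inline flag also shows one way around the case sensitivity noted above:

from pyspark.sql.functions import lit, regexp_like, regexp_substr

df.select(
    "text",
    # The matched substring itself, or NULL when there is no match
    regexp_substr("text", lit("hello")).alias("first_match"),
    # A boolean instead of a position; (?i) makes the match case-insensitive
    regexp_like("text", lit("(?i)HELLO")).alias("has_match")
).show(truncate=False)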