Skip to content

Commit 03d0481

Browse files
authored
Merge pull request #1281 from ragusaa/CLAM-2586-SaveUrlsHTML
Clam 2586 save urls html
2 parents 8ae19ec + 666e047 commit 03d0481

File tree

13 files changed

+669
-30
lines changed

13 files changed

+669
-30
lines changed

clamscan/clamscan.c

+2
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,8 @@ void help(void)
254254
mprintf(LOGG_INFO, " --gen-json[=yes/no(*)] Generate JSON metadata for the scanned file(s). For testing & development use ONLY.\n");
255255
mprintf(LOGG_INFO, " JSON will be printed if --debug is enabled.\n");
256256
mprintf(LOGG_INFO, " A JSON file will dropped to the temp directory if --leave-temps is enabled.\n");
257+
mprintf(LOGG_INFO, " --json-store-html-urls[=yes(*)/no] Store html URLs in metadata.\n");
258+
mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'HTMLUrls'\n");
257259
mprintf(LOGG_INFO, " --database=FILE/DIR -d FILE/DIR Load virus database from FILE or load all supported db files from DIR\n");
258260
mprintf(LOGG_INFO, " --official-db-only[=yes/no(*)] Only load official signatures\n");
259261
mprintf(LOGG_INFO, " --fail-if-cvd-older-than=days Return with a nonzero error code if virus database outdated.\n");

clamscan/manager.c

+4
Original file line numberDiff line numberDiff line change
@@ -1574,6 +1574,10 @@ int scanmanager(const struct optstruct *opts)
15741574
options.general |= CL_SCAN_GENERAL_HEURISTICS;
15751575
}
15761576

1577+
if (optget(opts, "json-store-html-urls")->enabled) {
1578+
options.general |= CL_SCAN_GENERAL_STORE_HTML_URLS;
1579+
}
1580+
15771581
/* TODO: Remove deprecated option in a future feature release */
15781582
if ((optget(opts, "block-max")->enabled) ||
15791583
(optget(opts, "alert-exceeds-max")->enabled)) {

common/optparser.c

+1
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,7 @@ const struct clam_option __clam_options[] = {
389389
{"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"},
390390

391391
{"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"},
392+
{"JsonStoreHTMLUrls", "json-store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML <form and <a tags.", "yes"},
392393

393394
{"HeuristicScanPrecedence", "heuristic-scan-precedence", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Allow heuristic match to take precedence.\nWhen enabled, if a heuristic scan (such as phishingScan) detects\na possible virus/phish it will stop scan immediately. Recommended, saves CPU\nscan-time.\nWhen disabled, virus/phish detected by heuristic scans will be reported only\nat the end of a scan. If an archive contains both a heuristically detected\nvirus/phish, and a real malware, the real malware will be reported.\nKeep this disabled if you intend to handle \"Heuristics.*\" viruses\ndifferently from \"real\" malware.\nIf a non-heuristically-detected virus (signature-based) is found first,\nthe scan is interrupted immediately, regardless of this config option.", "yes"},
394395

etc/clamd.conf.sample

+6
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,12 @@ Example
254254
# Default: no
255255
#GenerateMetadataJson yes
256256

257+
# Store URLs found in html files to the json metadata.
258+
# URLs will be stored in an array with the tag 'HTMLUrls'
259+
# GenerateMetadataJson is required for this feature.
260+
# Default: yes (if GenerateMetadataJson is used)
261+
#JsonStoreHTMLUrls no
262+
257263
# Permit use of the ALLMATCHSCAN command. If set to no, clamd will reject
258264
# any ALLMATCHSCAN command as invalid.
259265
# Default: yes

libclamav/clamav.h

+1
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ struct cl_scan_options {
168168
#define CL_SCAN_GENERAL_HEURISTICS 0x4 /* option to enable heuristic alerts */
169169
#define CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE 0x8 /* allow heuristic match to take precedence. */
170170
#define CL_SCAN_GENERAL_UNPRIVILEGED 0x10 /* scanner will not have read access to files. */
171+
#define CL_SCAN_GENERAL_STORE_HTML_URLS 0x20 /* Store urls found in html <a and <form tags when recording JSON metadata */
171172

172173
/* parsing capabilities options */
173174
#define CL_SCAN_PARSE_ARCHIVE 0x1

libclamav/hashtab.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -719,9 +719,9 @@ void cli_hashset_destroy(struct cli_hashset *hs)
719719
hs->capacity = 0;
720720
}
721721

722-
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val) & 0x1f)))
723-
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val) & 0x1f)))
724-
#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val) & 0x1f)))
722+
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val)&0x1f)))
723+
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val)&0x1f)))
724+
#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val)&0x1f)))
725725

726726
/*
727727
* searches the hashset for the @key.

libclamav/htmlnorm.c

+100-26
Original file line numberDiff line numberDiff line change
@@ -370,51 +370,70 @@ void html_tag_arg_add(tag_arguments_t *tags,
370370
const char *tag, char *value)
371371
{
372372
int len, i;
373-
tags->count++;
374-
tags->tag = (unsigned char **)cli_max_realloc_or_free(tags->tag,
375-
tags->count * sizeof(char *));
376-
if (!tags->tag) {
373+
int tagCnt = tags->count;
374+
int valueCnt = tags->count;
375+
int contentCnt = 0;
376+
unsigned char **tmp = NULL;
377+
378+
tmp = (unsigned char **)cli_max_realloc(tags->tag, (tagCnt + 1) * sizeof(char *));
379+
if (!tmp) {
377380
goto done;
378381
}
379-
tags->value = (unsigned char **)cli_max_realloc_or_free(tags->value,
380-
tags->count * sizeof(char *));
381-
if (!tags->value) {
382+
tags->tag = tmp;
383+
tagCnt++;
384+
385+
tmp = (unsigned char **)cli_max_realloc(tags->value, (valueCnt + 1) * sizeof(char *));
386+
if (!tmp) {
382387
goto done;
383388
}
389+
tags->value = tmp;
390+
valueCnt++;
391+
384392
if (tags->scanContents) {
385-
tags->contents = (unsigned char **)cli_max_realloc_or_free(tags->contents,
386-
tags->count * sizeof(*tags->contents));
387-
if (!tags->contents) {
393+
contentCnt = tags->count;
394+
tmp = (unsigned char **)cli_max_realloc(tags->contents, (contentCnt + 1) * sizeof(*tags->contents));
395+
if (!tmp) {
388396
goto done;
389397
}
390-
tags->contents[tags->count - 1] = NULL;
398+
tags->contents = tmp;
399+
tags->contents[contentCnt] = NULL;
400+
contentCnt++;
391401
}
392-
tags->tag[tags->count - 1] = (unsigned char *)cli_safer_strdup(tag);
402+
403+
tags->tag[tags->count] = (unsigned char *)cli_safer_strdup(tag);
393404
if (value) {
394405
if (*value == '"') {
395-
tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value + 1);
396-
len = strlen((const char *)value + 1);
406+
tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value + 1);
407+
if (NULL == tags->value[tags->count]) {
408+
goto done;
409+
}
410+
len = strlen((const char *)value + 1);
397411
if (len > 0) {
398-
tags->value[tags->count - 1][len - 1] = '\0';
412+
tags->value[tags->count][len - 1] = '\0';
399413
}
400414
} else {
401-
tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value);
415+
tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value);
402416
}
403417
} else {
404-
tags->value[tags->count - 1] = NULL;
418+
tags->value[tags->count] = NULL;
405419
}
420+
421+
tags->count++;
406422
return;
407423

408424
done:
409425
/* Bad error - can't do 100% recovery */
410-
tags->count--;
411-
for (i = 0; i < tags->count; i++) {
426+
for (i = 0; i < tagCnt; i++) {
412427
if (tags->tag) {
413428
free(tags->tag[i]);
414429
}
430+
}
431+
for (i = 0; i < valueCnt; i++) {
415432
if (tags->value) {
416433
free(tags->value[i]);
417434
}
435+
}
436+
for (i = 0; i < contentCnt; i++) {
418437
if (tags->contents) {
419438
if (tags->contents[i])
420439
free(tags->contents[i]);
@@ -649,7 +668,46 @@ static void js_process(struct parser_state *js_state, const unsigned char *js_be
649668
}
650669
}
651670

652-
static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
671+
/**
 * @brief Append a private copy of a URL string to a form_data_t list.
 *
 * The list either grows by exactly one entry or is left completely untouched:
 * on any failure the previously stored URLs stay valid and remain owned by
 * the caller, who releases them with html_form_data_tag_free().
 *
 * @param value  NUL-terminated string to copy. NULL is rejected.
 * @param tags   List to append to. NULL is rejected.
 * @return true  if the copy was stored and tags->count was incremented.
 * @return false if an argument was NULL or an allocation failed; nothing was
 *               stored and *tags is unchanged.
 */
bool html_insert_form_data(const char *const value, form_data_t *tags)
{
    bool bRet  = false;
    size_t cnt = 0;
    char **tmp = NULL;
    char *copy = NULL;

    if (NULL == value || NULL == tags) {
        goto done;
    }

    /*
     * Duplicate the string before growing the array, so that a failed copy
     * never leaves behind a grown array with an unfilled slot.
     */
    copy = cli_safer_strdup(value);
    if (NULL == copy) {
        goto done;
    }

    /*
     * Do NOT use cli_max_realloc_or_free here: if it freed the array on
     * failure, every previously stored URL string would be leaked.
     * NOTE(review): this relies on cli_max_realloc leaving the original
     * allocation intact when it fails (realloc semantics) - confirm.
     */
    cnt = tags->count + 1;
    tmp = cli_max_realloc(tags->urls, cnt * sizeof(*tags->urls));
    if (NULL == tmp) {
        /* Keep the existing list as-is; only the new copy is discarded. */
        free(copy);
        goto done;
    }
    tags->urls = tmp;

    tags->urls[tags->count] = copy;
    tags->count             = cnt;

    bRet = true;

done:
    return bRet;
}
700+
701+
/**
 * @brief Release every URL string held by a form_data_t and the array itself.
 *
 * After the call the list is empty (urls == NULL, count == 0), so it is safe
 * to call this function again or to start appending to the list again.
 * The form_data_t object itself is not freed.
 *
 * @param tags  List to clear. NULL is accepted and ignored.
 */
void html_form_data_tag_free(form_data_t *tags)
{
    size_t i;

    if (NULL == tags) {
        return;
    }

    if (NULL != tags->urls) {
        for (i = 0; i < tags->count; i++) {
            CLI_FREE_AND_SET_NULL(tags->urls[i]);
        }
        CLI_FREE_AND_SET_NULL(tags->urls);
    }

    /*
     * Reset the element count together with the array. A stale count would
     * make a later free or insert index into a NULL or uninitialised array.
     */
    tags->count = 0;
}
709+
710+
static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
653711
{
654712
int fd_tmp, tag_length = 0, tag_arg_length = 0;
655713
bool binary, retval = false, escape = false, hex = false;
@@ -659,7 +717,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
659717
FILE *stream_in = NULL;
660718
html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE;
661719
char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1];
662-
char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value;
720+
char tag_val[HTML_STR_LENGTH + 1], *tmp_file = NULL, *arg_value = NULL;
663721
unsigned char *line = NULL, *ptr, *ptr_screnc = NULL;
664722
tag_arguments_t tag_args;
665723
quoted_state quoted = NOT_QUOTED;
@@ -1224,8 +1282,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
12241282
href_contents_begin = ptr;
12251283
}
12261284
if (strcmp(tag, "/form") == 0) {
1227-
if (in_form_action)
1285+
if (in_form_action) {
12281286
free(in_form_action);
1287+
}
12291288
in_form_action = NULL;
12301289
}
12311290
} else if (strcmp(tag, "script") == 0) {
@@ -1310,9 +1369,13 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
13101369
} else if (strcmp(tag, "form") == 0 && hrefs->scanContents) {
13111370
const char *arg_action_value = html_tag_arg_value(&tag_args, "action");
13121371
if (arg_action_value) {
1313-
if (in_form_action)
1372+
if (in_form_action) {
13141373
free(in_form_action);
1374+
}
13151375
in_form_action = (unsigned char *)cli_safer_strdup(arg_action_value);
1376+
if (form_data) {
1377+
html_insert_form_data((const char *const)in_form_action, form_data);
1378+
}
13161379
}
13171380
} else if (strcmp(tag, "img") == 0) {
13181381
arg_value = html_tag_arg_value(&tag_args, "src");
@@ -1917,8 +1980,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
19171980
done:
19181981
if (line) /* only needed for done case */
19191982
free(line);
1920-
if (in_form_action)
1983+
if (in_form_action) {
19211984
free(in_form_action);
1985+
}
19221986
if (in_ahref) /* tag not closed, force closing */
19231987
html_tag_contents_done(hrefs, in_ahref, &contents);
19241988

@@ -1960,6 +2024,11 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
19602024
}
19612025

19622026
/*
 * Backward-compatible entry point for normalising an in-memory HTML buffer.
 * Forwards every argument unchanged to html_normalise_mem_form_data() and
 * passes NULL for form_data, then returns that function's result.
 */
bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
2027+
{
2028+
return html_normalise_mem_form_data(ctx, in_buff, in_size, dirname, hrefs, dconf, NULL);
2029+
}
2030+
2031+
bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
19632032
{
19642033
m_area_t m_area;
19652034

@@ -1968,18 +2037,23 @@ bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, con
19682037
m_area.offset = 0;
19692038
m_area.map = NULL;
19702039

1971-
return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
2040+
return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data);
19722041
}
19732042

19742043
/*
 * Backward-compatible entry point for normalising HTML held in an fmap.
 * Forwards every argument unchanged to html_normalise_map_form_data() and
 * passes NULL for form_data, then returns that function's result.
 */
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
2044+
{
2045+
return html_normalise_map_form_data(ctx, map, dirname, hrefs, dconf, NULL);
2046+
}
2047+
2048+
/*
 * Normalise HTML held in an fmap, optionally handing a form_data_t through to
 * the normaliser.
 *
 * Describes the input as an m_area_t (length taken from map->len, offset 0,
 * map pointing at the fmap) and runs cli_html_normalise() on it with a file
 * descriptor argument of -1. The boolean result of cli_html_normalise() is
 * returned unchanged.
 *
 * form_data is passed straight through; html_normalise_map() calls this
 * function with form_data == NULL.
 *
 * In this diff view the six-argument cli_html_normalise() call is the removed
 * line and the seven-argument call directly after it is its replacement.
 */
bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
19752049
{
19762050
bool retval = false;
19772051
m_area_t m_area;
19782052

19792053
m_area.length = map->len;
19802054
m_area.offset = 0;
19812055
m_area.map = map;
1982-
retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
2056+
retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data);
19832057
return retval;
19842058
}
19852059

libclamav/htmlnorm.h

+9
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,19 @@ typedef struct m_area_tag {
4545
fmap_t *map;
4646
} m_area_t;
4747

48+
/*
 * Growable list of URL strings collected while normalising HTML.
 * Entries are appended by html_insert_form_data() (called for <form> action
 * values in cli_html_normalise()) and released by html_form_data_tag_free().
 */
typedef struct form_data_tag {
49+
char **urls; /* array of `count` separately allocated, NUL-terminated strings */
50+
size_t count; /* number of entries stored in `urls` */
51+
} form_data_t;
52+
4853
bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
54+
bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data);
4955
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
56+
bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data);
5057
void html_tag_arg_free(tag_arguments_t *tags);
5158
bool html_screnc_decode(fmap_t *map, const char *dirname);
5259
void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value);
5360

61+
void html_form_data_tag_free(form_data_t *tags);
62+
5463
#endif

libclamav/others.h

+1
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,7 @@ extern LIBCLAMAV_EXPORT int have_rar;
552552
#define SCAN_HEURISTICS (ctx->options->general & CL_SCAN_GENERAL_HEURISTICS)
553553
#define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE)
554554
#define SCAN_UNPRIVILEGED (ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED)
555+
#define SCAN_STORE_HTML_URLS (ctx->options->general & CL_SCAN_GENERAL_STORE_HTML_URLS)
555556

556557
#define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE)
557558
#define SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF)

0 commit comments

Comments
 (0)