From c2a89b89689c68b4e12877186780ae367dcc570e Mon Sep 17 00:00:00 2001 From: flgc Date: Sun, 18 Aug 2019 11:52:44 +0900 Subject: [PATCH 1/6] enh: Add low hanging fruit from GO version --- filetype/types/application.py | 34 ++++++++++ filetype/types/archive.py | 88 +++++++++++++++++++++++++ filetype/types/audio.py | 21 ++++++ filetype/types/document.py | 118 ++++++++++++++++++++++++++++++++++ filetype/types/image.py | 27 ++++---- filetype/types/video.py | 94 +++++++++++++++++---------- 6 files changed, 335 insertions(+), 47 deletions(-) create mode 100644 filetype/types/application.py create mode 100644 filetype/types/document.py diff --git a/filetype/types/application.py b/filetype/types/application.py new file mode 100644 index 0000000..b9ac5f9 --- /dev/null +++ b/filetype/types/application.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +from .base import Type + + +class Wasm(Type): + """ + Implements the WASM Web Assembly 1.0 filetype. + + WASM starts with `\0asm`, followed by the version. + http://webassembly.github.io/spec/core/binary/modules.html#binary-magic + """ + + MIME = 'application/wasm' + EXTENSION = 'wasm' + + def __init__(self): + super(Wasm, self).__init__( + mime=Wasm.MIME, + extension=Wasm.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 8 and + buf[0] == 0x00 and + buf[1] == 0x61 and + buf[2] == 0x73 and + buf[3] == 0x6D and + buf[4] == 0x01 and + buf[5] == 0x00 and + buf[6] == 0x00 and + buf[7] == 0x00) diff --git a/filetype/types/archive.py b/filetype/types/archive.py index 5cde4b3..8a5eff7 100644 --- a/filetype/types/archive.py +++ b/filetype/types/archive.py @@ -513,3 +513,91 @@ def match(self, buf): buf[1] == 0x5A and buf[2] == 0x49 and buf[3] == 0x50) + + +class Rpm(Type): + """ + Implements the RPM archive type matcher. 
+ """ + MIME = 'application/x-rpm' + EXTENSION = 'rpm' + + def __init__(self): + super(Rpm, self).__init__( + mime=Rpm.MIME, + extension=Rpm.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 96 and + buf[0] == 0xED and + buf[1] == 0xAB and + buf[2] == 0xEE and + buf[3] == 0xDB) + + + +class Elf(Type): + """ + Implements the Elf archive type matcher. + """ + MIME = 'application/x-executable' + EXTENSION = 'elf' + + def __init__(self): + super(Elf, self).__init__( + mime=Elf.MIME, + extension=Elf.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 52 and + buf[0] == 0x7F and + buf[1] == 0x45 and + buf[2] == 0x4C and + buf[3] == 0x46) + + + +class Dcm(Type): + """ + Implements the Dcm archive type matcher. + """ + MIME = 'application/dicom' + EXTENSION = 'dcm' + + def __init__(self): + super(Dcm, self).__init__( + mime=Dcm.MIME, + extension=Dcm.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 131 and + buf[128] == 0x44 and + buf[129] == 0x49 and + buf[130] == 0x43 and + buf[131] == 0x4D) + + + +class Iso(Type): + """ + Implements the ISO archive type matcher. + """ + MIME = 'application/x-iso9660-image' + EXTENSION = 'iso' + + def __init__(self): + super(Iso, self).__init__( + mime=Iso.MIME, + extension=Iso.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 32773 and + buf[32769] == 0x43 and + buf[32770] == 0x44 and + buf[32771] == 0x30 and + buf[32772] == 0x30 and + buf[32773] == 0x31) diff --git a/filetype/types/audio.py b/filetype/types/audio.py index 5dafba5..c8c5644 100644 --- a/filetype/types/audio.py +++ b/filetype/types/audio.py @@ -164,3 +164,24 @@ def match(self, buf): buf[3] == 0x4D and buf[4] == 0x52 and buf[5] == 0x0A) + + +class Aac(Type): + """ + Implements the AAC audio type matcher. 
+ """ + MIME = 'audio/aac' + EXTENSION = 'aac' + + def __init__(self): + super(Aac, self).__init__( + mime=Aac.MIME, + extension=Aac.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 1 and + ((buf[0] == 0xFF and + buf[1] == 0xF1) or + (buf[0] == 0xFF and + buf[1] == 0xF9))) diff --git a/filetype/types/document.py b/filetype/types/document.py new file mode 100644 index 0000000..afad633 --- /dev/null +++ b/filetype/types/document.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +from .base import Type + + +class Doc(Type): + """ + Implements Doc file type matcher. + """ + + MIME = 'application/msword' + EXTENSION = 'doc' + + def __init__(self): + super(Doc, self).__init__( + mime=Doc.MIME, + extension=Doc.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 7 and + buf[0] == 0xD0 and + buf[1] == 0xCF and + buf[2] == 0x11 and + buf[3] == 0xE0 and + buf[4] == 0xA1 and + buf[5] == 0xB1 and + buf[6] == 0x1A and + buf[7] == 0xE1) + + +# todo: Docx + + +class Xls(Type): + """ + Implements Xls file type matcher. + Which looks just like the Doc file type matcher.. + """ + + MIME = 'application/vnd.ms-excel' + EXTENSION = 'xls' + + def __init__(self): + super(Xls, self).__init__( + mime=Xls.MIME, + extension=Xls.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 7 and + buf[0] == 0xD0 and + buf[1] == 0xCF and + buf[2] == 0x11 and + buf[3] == 0xE0 and + buf[4] == 0xA1 and + buf[5] == 0xB1 and + buf[6] == 0x1A and + buf[7] == 0xE1) + + +class Xls(Type): + """ + Implements Xls file type matcher. + Which looks like the Doc file type matcher.. 
+ """ + + MIME = 'application/vnd.ms-excel' + EXTENSION = 'xls' + + def __init__(self): + super(Xls, self).__init__( + mime=Xls.MIME, + extension=Xls.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 7 and + buf[0] == 0xD0 and + buf[1] == 0xCF and + buf[2] == 0x11 and + buf[3] == 0xE0 and + buf[4] == 0xA1 and + buf[5] == 0xB1 and + buf[6] == 0x1A and + buf[7] == 0xE1) + + +# todo: Xlsx + + +class Ppt(Type): + """ + Implements Ppt file type matcher. + Which looks like the Doc and Xls file type matchers.. + """ + + MIME = 'application/vnd.ms-powerpoint' + EXTENSION = 'ppt' + + def __init__(self): + super(Ppt, self).__init__( + mime=Ppt.MIME, + extension=Ppt.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 7 and + buf[0] == 0xD0 and + buf[1] == 0xCF and + buf[2] == 0x11 and + buf[3] == 0xE0 and + buf[4] == 0xA1 and + buf[5] == 0xB1 and + buf[6] == 0x1A and + buf[7] == 0xE1) diff --git a/filetype/types/image.py b/filetype/types/image.py index 1fd6e17..7fe107d 100644 --- a/filetype/types/image.py +++ b/filetype/types/image.py @@ -259,21 +259,22 @@ def match(self, buf): return False -class Dcm(Type): - - MIME = 'application/dicom' - EXTENSION = 'dcm' - OFFSET = 128 +class Dwg(Type): + """ + Implements the DWG image type matcher. 
+ """ + MIME = 'image/vnd.dwg' + EXTENSION = 'dwg' def __init__(self): - super(Dcm, self).__init__( - mime=Dcm.MIME, - extension=Dcm.EXTENSION + super(Dwg, self).__init__( + mime=Dwg.MIME, + extension=Dwg.EXTENSION, ) def match(self, buf): - return (len(buf) > Dcm.OFFSET + 4 and - buf[Dcm.OFFSET + 0] == 0x44 and - buf[Dcm.OFFSET + 1] == 0x49 and - buf[Dcm.OFFSET + 2] == 0x43 and - buf[Dcm.OFFSET + 3] == 0x4D) + return (len(buf) > 3 and + buf[0] == 0x41 and + buf[1] == 0x43 and + buf[2] == 0x31 and + buf[3] == 0x30) diff --git a/filetype/types/video.py b/filetype/types/video.py index 9955397..2e370d1 100644 --- a/filetype/types/video.py +++ b/filetype/types/video.py @@ -6,27 +6,6 @@ from .isobmff import IsoBmff -class Mp4(IsoBmff): - """ - Implements the MP4 video type matcher. - """ - MIME = 'video/mp4' - EXTENSION = 'mp4' - - def __init__(self): - super(Mp4, self).__init__( - mime=Mp4.MIME, - extension=Mp4.EXTENSION - ) - - def match(self, buf): - if not self._is_isobmff(buf): - return False - - major_brand, minor_version, compatible_brands = self._get_ftyp(buf) - return major_brand in ['mp41', 'mp42'] - - class M4v(Type): """ Implements the M4V video type matcher. @@ -64,6 +43,8 @@ def __init__(self): ) def match(self, buf): + # todo: GO source checks first 4 bytes, then searches next 4096 for 'matroska', + # and finally checks if bytes 5 & 6 below are located prior to 'matroska'. return ((len(buf) > 15 and buf[0] == 0x1A and buf[1] == 0x45 and buf[2] == 0xDF and buf[3] == 0xA3 and @@ -173,6 +154,28 @@ def match(self, buf): buf[9] == 0xD9) +class Mpeg(Type): + """ + Implements the MPEG video type matcher. 
+ """ + MIME = 'video/mpeg' + EXTENSION = 'mpg' + + def __init__(self): + super(Mpeg, self).__init__( + mime=Mpeg.MIME, + extension=Mpeg.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 3 and + buf[0] == 0x0 and + buf[1] == 0x0 and + buf[2] == 0x1 and + buf[3] >= 0xb0 and + buf[3] <= 0xbf) + + class Flv(Type): """ Implements the FLV video type matcher. @@ -194,23 +197,46 @@ def match(self, buf): buf[3] == 0x01) -class Mpeg(Type): +class Mp4(IsoBmff): """ - Implements the MPEG video type matcher. + Implements the MP4 video type matcher. """ - MIME = 'video/mpeg' - EXTENSION = 'mpg' + MIME = 'video/mp4' + EXTENSION = 'mp4' def __init__(self): - super(Mpeg, self).__init__( - mime=Mpeg.MIME, - extension=Mpeg.EXTENSION + super(Mp4, self).__init__( + mime=Mp4.MIME, + extension=Mp4.EXTENSION ) def match(self, buf): - return (len(buf) > 3 and - buf[0] == 0x0 and - buf[1] == 0x0 and - buf[2] == 0x1 and - buf[3] >= 0xb0 and - buf[3] <= 0xbf) + if not self._is_isobmff(buf): + return False + + major_brand, minor_version, compatible_brands = self._get_ftyp(buf) + return major_brand in ['mp41', 'mp42'] + + +class Match3gp(Type): + """ + Implements the 3GP video type matcher. 
+ """ + MIME = 'video/3gpp' + EXTENSION = '3gp' + + def __init__(self): + super(Match3gp, self).__init__( + mime=Match3gp.MIME, + extension=Match3gp.EXTENSION + ) + + def match(self, buf): + return (len(buf) > 10 and + buf[4] == 0x66 and + buf[5] == 0x74 and + buf[6] == 0x79 and + buf[7] == 0x70 and + buf[8] == 0x33 and + buf[9] == 0x67 and + buf[10] == 0x70) From fa1d9af62e5ed776ea4c44fd3f760145d302374c Mon Sep 17 00:00:00 2001 From: flgc Date: Sun, 18 Aug 2019 12:03:15 +0900 Subject: [PATCH 2/6] bug: Replace Dcm with Dwg under IMAGES --- filetype/types/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filetype/types/__init__.py b/filetype/types/__init__.py index 87d95b8..8457328 100644 --- a/filetype/types/__init__.py +++ b/filetype/types/__init__.py @@ -23,7 +23,7 @@ image.Psd(), image.Ico(), image.Heic(), - image.Dcm(), + image.Dwg(), ) # Supported video types From 08bee2021bd1f938b0b19989e8627e58b5bcc281 Mon Sep 17 00:00:00 2001 From: flgc Date: Sun, 18 Aug 2019 12:19:09 +0900 Subject: [PATCH 3/6] bug: Update __init__ for new types --- filetype/types/__init__.py | 27 ++++++++++++++++++++++++--- filetype/types/archive.py | 3 --- filetype/types/document.py | 27 --------------------------- 3 files changed, 24 insertions(+), 33 deletions(-) diff --git a/filetype/types/__init__.py b/filetype/types/__init__.py index 8457328..4836c20 100644 --- a/filetype/types/__init__.py +++ b/filetype/types/__init__.py @@ -2,8 +2,10 @@ from __future__ import absolute_import +from . import application from . import archive from . import audio +from . import document from . import font from . import image from . 
import video @@ -28,15 +30,16 @@ # Supported video types VIDEO = ( - video.Mp4(), video.M4v(), video.Mkv(), + video.Webm(), video.Mov(), video.Avi(), video.Wmv(), video.Mpeg(), - video.Webm(), video.Flv(), + video.Mp4(), + video.Match3gp(), ) # Supported audio types @@ -48,6 +51,7 @@ audio.Flac(), audio.Wav(), audio.Amr(), + audio.Aac(), ) # Supported font types @@ -77,7 +81,24 @@ archive.Ar(), archive.Z(), archive.Lz(), + archive.Rpm(), + archive.Elf(), + archive.Dcm(), + archive.Iso(), ) +# Supported application types +APPLICATION = ( + application.Wasm(), +) + +# Supported document types +DOCUMENT = ( + document.Doc(), + document.Xls(), + document.Ppt(), +) + + # Expose supported type matchers -TYPES = list(VIDEO + IMAGE + AUDIO + FONT + ARCHIVE) +TYPES = list(VIDEO + IMAGE + AUDIO + FONT + ARCHIVE + APPLICATION + DOCUMENT) diff --git a/filetype/types/archive.py b/filetype/types/archive.py index 8a5eff7..2e10ec9 100644 --- a/filetype/types/archive.py +++ b/filetype/types/archive.py @@ -536,7 +536,6 @@ def match(self, buf): buf[3] == 0xDB) - class Elf(Type): """ Implements the Elf archive type matcher. @@ -558,7 +557,6 @@ def match(self, buf): buf[3] == 0x46) - class Dcm(Type): """ Implements the Dcm archive type matcher. @@ -580,7 +578,6 @@ def match(self, buf): buf[131] == 0x4D) - class Iso(Type): """ Implements the ISO archive type matcher. diff --git a/filetype/types/document.py b/filetype/types/document.py index afad633..aea313b 100644 --- a/filetype/types/document.py +++ b/filetype/types/document.py @@ -34,33 +34,6 @@ def match(self, buf): # todo: Docx -class Xls(Type): - """ - Implements Xls file type matcher. - Which looks just like the Doc file type matcher.. 
- """ - - MIME = 'application/vnd.ms-excel' - EXTENSION = 'xls' - - def __init__(self): - super(Xls, self).__init__( - mime=Xls.MIME, - extension=Xls.EXTENSION - ) - - def match(self, buf): - return (len(buf) > 7 and - buf[0] == 0xD0 and - buf[1] == 0xCF and - buf[2] == 0x11 and - buf[3] == 0xE0 and - buf[4] == 0xA1 and - buf[5] == 0xB1 and - buf[6] == 0x1A and - buf[7] == 0xE1) - - class Xls(Type): """ Implements Xls file type matcher. From e65c1117fdd0bbc3c6e7b4e8c545263568dda0ee Mon Sep 17 00:00:00 2001 From: flgc Date: Sun, 18 Aug 2019 12:28:03 +0900 Subject: [PATCH 4/6] bug: Import types in match.py --- filetype/match.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/filetype/match.py b/filetype/match.py index f1d9891..8a60802 100644 --- a/filetype/match.py +++ b/filetype/match.py @@ -2,8 +2,10 @@ from __future__ import absolute_import +from .types import APPLICATION as application_matchers from .types import ARCHIVE as archive_matchers from .types import AUDIO as audio_matchers +from .types import DOCUMENT as document_matchers from .types import FONT as font_matchers from .types import IMAGE as image_matchers from .types import VIDEO as video_matchers From 5091580fe03b38f63084c6ca0276d2f291c812d9 Mon Sep 17 00:00:00 2001 From: flgc Date: Sun, 18 Aug 2019 12:37:09 +0900 Subject: [PATCH 5/6] enh: Add new helpers and matchers for new types --- filetype/helpers.py | 32 ++++++++++++++++++++++++++++++++ filetype/match.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/filetype/helpers.py b/filetype/helpers.py index 9aafa29..a3e170f 100644 --- a/filetype/helpers.py +++ b/filetype/helpers.py @@ -120,3 +120,35 @@ def is_font(obj): TypeError: if obj is not a supported type. """ return match.font(obj) is not None + + +def is_application(obj): + """ + Checks if a given input is a supported type application. + + Args: + obj: path to file, bytes or bytearray. + + Returns: + True if obj is a valid application. Otherwise False. 
+ + Raises: + TypeError: if obj is not a supported type. + """ + return match.application(obj) is not None + + +def is_document(obj): + """ + Checks if a given input is a supported type document. + + Args: + obj: path to file, bytes or bytearray. + + Returns: + True if obj is a valid document. Otherwise False. + + Raises: + TypeError: if obj is not a supported type. + """ + return match.document(obj) is not None diff --git a/filetype/match.py b/filetype/match.py index 8a60802..de2d1fe 100644 --- a/filetype/match.py +++ b/filetype/match.py @@ -119,3 +119,37 @@ def archive(obj): TypeError: if obj is not a supported type. """ return match(obj, archive_matchers) + + +def application(obj): + """ + Matches the given input against the available + application type matchers. + + Args: + obj: path to file, bytes or bytearray. + + Returns: + Type instance if matches. Otherwise None. + + Raises: + TypeError: if obj is not a supported type. + """ + return match(obj, application_matchers) + + +def document(obj): + """ + Matches the given input against the available + document type matchers. + + Args: + obj: path to file, bytes or bytearray. + + Returns: + Type instance if matches. Otherwise None. + + Raises: + TypeError: if obj is not a supported type. 
+ """ + return match(obj, document_matchers) From 1937d2b22b6852655e79f40550306a5e7282e9c5 Mon Sep 17 00:00:00 2001 From: flgc Date: Sun, 18 Aug 2019 12:46:23 +0900 Subject: [PATCH 6/6] chores: Add new detected file types to docs --- README.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.rst b/README.rst index 529ac4a..b57a54e 100644 --- a/README.rst +++ b/README.rst @@ -74,6 +74,7 @@ Image - **psd** - ``image/vnd.adobe.photoshop`` - **ico** - ``image/x-icon`` - **heic** - ``image/heic`` +- **dwg** - ``image/vnd.dwg`` Video ^^^^^ @@ -87,6 +88,7 @@ Video - **wmv** - ``video/x-ms-wmv`` - **mpg** - ``video/mpeg`` - **flv** - ``video/x-flv`` +- **3gp** - ``video/3gpp`` Audio ^^^^^ @@ -98,6 +100,13 @@ Audio - **flac** - ``audio/x-flac`` - **wav** - ``audio/x-wav`` - **amr** - ``audio/amr`` +- **aac** - ``audio/aac`` + +Application +^^^^^^^^^^^ + +- **wasm** - ``application/wasm`` + Archive ^^^^^^^ @@ -126,6 +135,11 @@ Archive - **Z** - ``application/x-compress`` - **lz** - ``application/x-lzip`` +- **rpm** - ``application/x-rpm`` +- **elf** - ``application/x-executable`` +- **dcm** - ``application/dicom`` +- **iso** - ``application/x-iso9660-image`` + Font ^^^^ @@ -134,6 +148,13 @@ Font - **ttf** - ``application/font-sfnt`` - **otf** - ``application/font-sfnt`` +Document +^^^^^^^^ +- **doc** - ``application/msword`` +- **xls** - ``application/vnd.ms-excel`` +- **ppt** - ``application/vnd.ms-powerpoint`` + + .. _Python: http://python.org .. _magic numbers: https://en.wikipedia.org/wiki/Magic_number_(programming)#Magic_numbers_in_files .. _filetype: https://github.com/h2non/filetype