Skip to content

Updating the library to Python 3 #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,20 @@ Eg: ~/cvpath/data/sample/a.pdf is parsed by
```bash
cvscan parse --name data/sample/a
```
Output of parsing the sample inside data/sample is also provided in the `output.json` file.

## Installing NLTK-related packages
You can either download all the NLTK packages or choose only the ones listed below, which are the ones the cvscan library needs.

```
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
```

Data Manipulations
===============
Expand Down
21 changes: 10 additions & 11 deletions cvscan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,18 @@
Main program

"""
import os

import converter
import annotations_parser
import details_parser as dp
import language_parser as lp
import json
import dirpath
import configurations
from cvscan import converter
from cvscan import annotations_parser
from cvscan import details_parser as dp
from cvscan import language_parser as lp
from cvscan import dirpath
from cvscan import configurations

class Cvscan():
def __init__(self, name, path = dirpath.RESUMEPATH):
self.path = path + '/' + name + '.pdf'
self.path = name + '.pdf'
self.path = os.path.join(path, name)

if self.exists():
self.extract()
Expand Down Expand Up @@ -56,7 +55,7 @@ def parse(self):

# TODO: Add more fetch here
def show(self):
return json.dumps({
return {
"name" : self.name,
"experience" : self.experience,
"address" : self.address,
Expand All @@ -71,4 +70,4 @@ def show(self):
"qualifications" : self.qualifications,
"qualifications_info" : self.degree_info,
"extra_info" : self.extra_info
})
}
6 changes: 3 additions & 3 deletions cvscan/annotations_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ def fetch_pdf_urls(file_name):
if type(link_object) is not dict:
link_object = link_object.resolve()
if link_object['A']['URI']:
links.append(link_object['A']['URI'])
links.append(link_object['A']['URI'].decode())
file_pointer.close()
return links

except Exception, exception_instance:
except Exception as exception_instance:
logging.error('Error while fetching URLs : '+str(exception_instance))
return ''
return []
4 changes: 2 additions & 2 deletions cvscan/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def add(org,skill,job,qual,extra):
_job = _job.split(':')
jobs[_job[0]] = _job[1]
except Exception:
print "Something wnet wrong: " + Exception
print("Something wnet wrong: " + Exception)
do.add_jobs(jobs)
if qual:
do.add_qualifications(qual.split(','))
Expand Down Expand Up @@ -128,4 +128,4 @@ def remove(org,skill,job,qual,extra):
if qual:
do.remove_qualifications(qual.split(','))
if extra:
do.remove_extra(extra.split(','))
do.remove_extra(extra.split(','))
12 changes: 5 additions & 7 deletions cvscan/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
import re
import logging

import configurations as regex
from cvscan import configurations as regex

# for converting pdfs to text
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
from io import StringIO

logging.basicConfig(level=logging.DEBUG)

Expand All @@ -33,10 +33,8 @@ def pdf_to_txt(file_name):
# Setting up pdf reader
pdf_resource_manager = PDFResourceManager()
return_string = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(pdf_resource_manager, return_string, codec=codec, \
laparams=laparams)
device = TextConverter(pdf_resource_manager, return_string, laparams=laparams)
interpreter = PDFPageInterpreter(pdf_resource_manager, device)

for page in PDFPage.get_pages(file_pointer, set(), maxpages=0, password="",
Expand All @@ -55,8 +53,8 @@ def pdf_to_txt(file_name):
pdf_txt = pdf_txt.replace("\r", "\n")
pdf_txt = re.sub(regex.bullet, " ", pdf_txt)

return pdf_txt.decode('ascii', errors='ignore')
return pdf_txt

except Exception, exception_instance:
except Exception as exception_instance:
logging.error('Error converting pdf to txt: '+str(exception_instance))
return ''
Binary file modified cvscan/data/address/district-states
Binary file not shown.
Binary file modified cvscan/data/address/pincode-district-state
Binary file not shown.
Binary file modified cvscan/data/address/pincodes
Binary file not shown.
Binary file modified cvscan/data/address/states
Binary file not shown.
2 changes: 0 additions & 2 deletions cvscan/data/extra/extra
Original file line number Diff line number Diff line change
@@ -1,2 +0,0 @@
(lp0
.
Binary file modified cvscan/data/job_positions/positions
Binary file not shown.
Binary file modified cvscan/data/organizations/avoid_organizations
Binary file not shown.
Binary file modified cvscan/data/organizations/explicit_organizations
Binary file not shown.
Binary file modified cvscan/data/qualifications/degree
Binary file not shown.
224 changes: 224 additions & 0 deletions cvscan/data/sample/output.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
{
"address":{
"district":"Chennai",
"pincode":"",
"state":"Tamil Nadu"
},
"current_employers":[
"Delta force",
"Software developer application developer",
"National institute"
],
"emails":[
"[email protected]"
],
"employers":[
"Nit",
"Algorithmic",
"Analysis",
"Mac",
"Computer networks Exchange student",
"Institute of mathematical sciences",
"Steganography multimedia systems",
"project Delta force",
"National institute"
],
"experience":5,
"extra_info":[

],
"job category":"Other",
"jobs":[
"Programmer",
"Application developer",
"Independent",
"Student",
"Research",
"Software developer",
"Member"
],
"name":"Lakshmanaram",
"phone_numbers":"+919043804100",
"qualifications":[
"B.Tech"
],
"qualifications_info":[
"IN COMPUTER SCIENCE AND"
],
"skills":[
"C",
"Databases",
"Engineering",
"Java",
"Linux",
"Management",
"Operating Systems",
"Research",
"SQL",
"Unix",
"ACM",
"API",
"Active",
"Activities",
"Adaptive",
"Advanced",
"Algorithms",
"Analysis",
"Android",
"Android Development",
"Android Studio",
"Apache",
"Apache Spark",
"App",
"Application",
"Applications",
"Architecture",
"Article",
"Audience",
"Award",
"Awards",
"Bash",
"Bronze",
"C",
"COM",
"Calls",
"Challenge",
"Chennai",
"Circuit",
"Circuit Design",
"Club",
"Code",
"Codes",
"Competitive",
"Computer Science",
"Contests",
"Core",
"Coursework",
"Cycle",
"Data",
"Data Structures",
"Database",
"Databases",
"Delta",
"Design",
"Design Analysis",
"Designing",
"Developer",
"Easy",
"Education",
"Engineering",
"Exp",
"Expert",
"Featured",
"Festivals",
"Git",
"Github",
"Gmail",
"Gold",
"Google",
"Graph Theory",
"Hobbies",
"Hosted",
"IO",
"Independent",
"India",
"Input",
"Institute",
"Integrations",
"Java",
"Languages",
"Learning",
"Lines",
"LinkedIn",
"Links",
"Linux",
"Logic",
"Mac",
"Machine Learning",
"Management",
"Medals",
"Mobile",
"Mobile Applications",
"Multimedia",
"N",
"National",
"ODD",
"ONE",
"Octave",
"Open Source",
"Operating",
"Operating Systems",
"Part",
"Participants",
"Play",
"Premier",
"Problems",
"Professors",
"Programming",
"Programming Languages",
"Puzzles",
"Python",
"PH",
"QR",
"Rating",
"Reduce",
"Registration",
"Related",
"Related Activities",
"Research",
"Review",
"Robotics",
"SQL",
"STARS",
"Science",
"Session",
"Set",
"Shell",
"Silver",
"Skills",
"Software",
"Source",
"Spark",
"Special",
"Special Interest",
"Steganography",
"Store",
"Structures",
"Studio",
"Sudoku",
"Systems",
"Talk",
"Teams",
"Techno",
"Technology",
"Template",
"Terminal",
"Theory",
"Track Changes",
"Trending",
"Tutorials",
"Ubuntu",
"Undergraduate",
"Unix",
"Vocabulary",
"Weekly",
"Word",
"World"
],
"urls":[
"mailto:[email protected]",
"https://github.com/lakshmanaram",
"https://in.linkedin.com/in/lakshmanaram",
"https://www.hackerrank.com/lakshmanaram",
"http://codeforces.com/profile/henry_colebrooke",
"https://www.codechef.com/users/suicune",
"https://github.com/lakshmanaram/incor",
"http://incor.readthedocs.io/en/latest/",
"https://github.com/lakshmanaram/Steganography",
"http://asp.eurasipjournals.springeropen.com/articles/10.1155/2010/876946",
"https://github.com/lakshmanaram/Pragyan-app",
"https://play.google.com/store/apps/details?id=org.pragyan.pragyantshirtapp&hl=en",
"https://github.com/badarsh2/Sudocabulary",
"http://www.omgubuntu.co.uk/2016/08/learn-new-word-terminal"
]
}
Binary file modified cvscan/data/skills/skills
Binary file not shown.
Loading