diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..61c48e7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.idea/
+__pycache__/
+log/
diff --git a/batcher.py b/batcher.py
index 233800e..6a17273 100644
--- a/batcher.py
+++ b/batcher.py
@@ -16,7 +16,7 @@
 
 """This file contains code to process data into batches"""
 
-import Queue
+import queue as Queue
 from random import shuffle
 from threading import Thread
 import time
@@ -167,7 +167,7 @@ def init_encoder_seq(self, example_list, hps):
     for i, ex in enumerate(example_list):
       self.enc_batch[i, :] = ex.enc_input[:]
       self.enc_lens[i] = ex.enc_len
-      for j in xrange(ex.enc_len):
+      for j in range(ex.enc_len):
         self.enc_padding_mask[i][j] = 1
 
     # For pointer-generator mode, need to store some extra info
@@ -204,7 +204,7 @@ def init_decoder_seq(self, example_list, hps):
     for i, ex in enumerate(example_list):
       self.dec_batch[i, :] = ex.dec_input[:]
       self.target_batch[i, :] = ex.target[:]
-      for j in xrange(ex.dec_len):
+      for j in range(ex.dec_len):
         self.dec_padding_mask[i][j] = 1
 
   def store_orig_strings(self, example_list):
@@ -250,12 +250,12 @@ def __init__(self, data_path, vocab, hps, single_pass):
 
     # Start the threads that load the queues
     self._example_q_threads = []
-    for _ in xrange(self._num_example_q_threads):
+    for _ in range(self._num_example_q_threads):
       self._example_q_threads.append(Thread(target=self.fill_example_queue))
       self._example_q_threads[-1].daemon = True
       self._example_q_threads[-1].start()
     self._batch_q_threads = []
-    for _ in xrange(self._num_batch_q_threads):
+    for _ in range(self._num_batch_q_threads):
       self._batch_q_threads.append(Thread(target=self.fill_batch_queue))
       self._batch_q_threads[-1].daemon = True
       self._batch_q_threads[-1].start()
@@ -292,7 +292,7 @@ def fill_example_queue(self):
 
     while True:
       try:
-        (article, abstract) = input_gen.next() # read the next example from file. article and abstract are both strings.
+        (article, abstract) = next(input_gen) # read the next example from file. article and abstract are both strings.
       except StopIteration: # if there are no more examples:
         tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
         if self._single_pass:
@@ -316,13 +316,13 @@ def fill_batch_queue(self):
       if self._hps.mode != 'decode':
         # Get bucketing_cache_size-many batches of Examples into a list, then sort
         inputs = []
-        for _ in xrange(self._hps.batch_size * self._bucketing_cache_size):
+        for _ in range(self._hps.batch_size * self._bucketing_cache_size):
           inputs.append(self._example_queue.get())
         inputs = sorted(inputs, key=lambda inp: inp.enc_len) # sort by length of encoder sequence
 
         # Group the sorted Examples into batches, optionally shuffle the batches, and place in the batch queue.
         batches = []
-        for i in xrange(0, len(inputs), self._hps.batch_size):
+        for i in range(0, len(inputs), self._hps.batch_size):
           batches.append(inputs[i:i + self._hps.batch_size])
         if not self._single_pass:
           shuffle(batches)
@@ -331,7 +331,7 @@ def fill_batch_queue(self):
 
       else: # beam search decode mode
         ex = self._example_queue.get()
-        b = [ex for _ in xrange(self._hps.batch_size)]
+        b = [ex for _ in range(self._hps.batch_size)]
         self._batch_queue.put(Batch(b, self._hps, self._vocab))
 
 
@@ -361,10 +361,10 @@ def text_generator(self, example_generator):
     Args:
       example_generator: a generator of tf.Examples from file. See data.example_generator"""
     while True:
-      e = example_generator.next() # e is a tf.Example
+      e = next(example_generator) # e is a tf.Example
       try:
-        article_text = e.features.feature['article'].bytes_list.value[0] # the article text was saved under the key 'article' in the data files
-        abstract_text = e.features.feature['abstract'].bytes_list.value[0] # the abstract text was saved under the key 'abstract' in the data files
+        article_text = e.features.feature['article'].bytes_list.value[0].decode() # the article text was saved under the key 'article' in the data files
+        abstract_text = e.features.feature['abstract'].bytes_list.value[0].decode() # the abstract text was saved under the key 'abstract' in the data files
       except ValueError:
         tf.logging.error('Failed to get article or abstract from example')
         continue
diff --git a/beam_search.py b/beam_search.py
index ff3e328..d2c2899 100644
--- a/beam_search.py
+++ b/beam_search.py
@@ -102,13 +102,13 @@ def run_beam_search(sess, model, vocab, batch):
                      attn_dists=[],
                      p_gens=[],
                      coverage=np.zeros([batch.enc_batch.shape[1]]) # zero vector of length attention_length
-                     ) for _ in xrange(FLAGS.beam_size)]
+                     ) for _ in range(FLAGS.beam_size)]
   results = [] # this will contain finished hypotheses (those that have emitted the [STOP] token)
 
   steps = 0
   while steps < FLAGS.max_dec_steps and len(results) < FLAGS.beam_size:
     latest_tokens = [h.latest_token for h in hyps] # latest token produced by each hypothesis
-    latest_tokens = [t if t in xrange(vocab.size()) else vocab.word2id(data.UNKNOWN_TOKEN) for t in latest_tokens] # change any in-article temporary OOV ids to [UNK] id, so that we can lookup word embeddings
+    latest_tokens = [t if t in range(vocab.size()) else vocab.word2id(data.UNKNOWN_TOKEN) for t in latest_tokens] # change any in-article temporary OOV ids to [UNK] id, so that we can lookup word embeddings
     states = [h.state for h in hyps] # list of current decoder states of the hypotheses
     prev_coverage = [h.coverage for h in hyps] # list of coverage vectors (or None)
 
@@ -123,9 +123,9 @@ def run_beam_search(sess, model, vocab, batch):
     # Extend each hypothesis and collect them all in all_hyps
     all_hyps = []
     num_orig_hyps = 1 if steps == 0 else len(hyps) # On the first step, we only had one original hypothesis (the initial hypothesis). On subsequent steps, all original hypotheses are distinct.
-    for i in xrange(num_orig_hyps):
+    for i in range(num_orig_hyps):
       h, new_state, attn_dist, p_gen, new_coverage_i = hyps[i], new_states[i], attn_dists[i], p_gens[i], new_coverage[i]  # take the ith hypothesis and new decoder state info
-      for j in xrange(FLAGS.beam_size * 2):  # for each of the top 2*beam_size hyps:
+      for j in range(FLAGS.beam_size * 2):  # for each of the top 2*beam_size hyps:
         # Extend the ith hypothesis with the jth option
         new_hyp = h.extend(token=topk_ids[i, j],
                            log_prob=topk_log_probs[i, j],
diff --git a/data.py b/data.py
index 93180fe..c3839ef 100644
--- a/data.py
+++ b/data.py
@@ -58,7 +58,7 @@ def __init__(self, vocab_file, max_size):
       for line in vocab_f:
         pieces = line.split()
         if len(pieces) != 2:
-          print 'Warning: incorrectly formatted line in vocabulary file: %s\n' % line
+          print('Warning: incorrectly formatted line in vocabulary file: %s\n' % line)
           continue
         w = pieces[0]
         if w in [SENTENCE_START, SENTENCE_END, UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING]:
@@ -69,10 +69,10 @@ def __init__(self, vocab_file, max_size):
         self._id_to_word[self._count] = w
         self._count += 1
         if max_size != 0 and self._count >= max_size:
-          print "max_size of vocab was specified as %i; we now have %i words. Stopping reading." % (max_size, self._count)
+          print("max_size of vocab was specified as %i; we now have %i words. Stopping reading." % (max_size, self._count))
           break
 
-    print "Finished constructing vocabulary of %i total words. Last word added: %s" % (self._count, self._id_to_word[self._count-1])
+    print("Finished constructing vocabulary of %i total words. Last word added: %s" % (self._count, self._id_to_word[self._count-1]))
 
   def word2id(self, word):
     """Returns the id (integer) of a word (string). Returns [UNK] id if word is OOV."""
@@ -97,11 +97,11 @@ def write_metadata(self, fpath):
     Args:
       fpath: place to write the metadata file
     """
-    print "Writing word embedding metadata file to %s..." % (fpath)
+    print("Writing word embedding metadata file to %s..." % (fpath))
     with open(fpath, "w") as f:
       fieldnames = ['word']
       writer = csv.DictWriter(f, delimiter="\t", fieldnames=fieldnames)
-      for i in xrange(self.size()):
+      for i in range(self.size()):
         writer.writerow({"word": self._id_to_word[i]})
 
 
@@ -137,7 +137,7 @@ def example_generator(data_path, single_pass):
         example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
         yield example_pb2.Example.FromString(example_str)
     if single_pass:
-      print "example_generator completed reading all datafiles. No more data."
+      print("example_generator completed reading all datafiles. No more data.")
       break
 
 
diff --git a/decode.py b/decode.py
index 90b5aec..06f573d 100644
--- a/decode.py
+++ b/decode.py
@@ -191,11 +191,11 @@ def write_for_attnvis(self, article, abstract, decoded_words, attn_dists, p_gens
 
 def print_results(article, abstract, decoded_output):
   """Prints the article, the reference summmary and the decoded summary to screen"""
-  print ""
+  print("---------------------------------------------------------------------------")
   tf.logging.info('ARTICLE:  %s', article)
   tf.logging.info('REFERENCE SUMMARY: %s', abstract)
   tf.logging.info('GENERATED SUMMARY: %s', decoded_output)
-  print ""
+  print("---------------------------------------------------------------------------")
 
 
 def make_html_safe(s):
diff --git a/inspect_checkpoint.py b/inspect_checkpoint.py
index 58d85a8..f97c71d 100644
--- a/inspect_checkpoint.py
+++ b/inspect_checkpoint.py
@@ -29,17 +29,16 @@
       else:
         some_infnan.append(key)
 
-  print "\nFINITE VARIABLES:"
-  for key in finite: print key
+  print("\nFINITE VARIABLES:")
+  for key in finite: print(key)
 
-  print "\nVARIABLES THAT ARE ALL INF/NAN:"
-  for key in all_infnan: print key
+  print("\nVARIABLES THAT ARE ALL INF/NAN:")
+  for key in all_infnan: print(key)
 
-  print "\nVARIABLES THAT CONTAIN SOME FINITE, SOME INF/NAN VALUES:"
-  for key in some_infnan: print key
+  print("\nVARIABLES THAT CONTAIN SOME FINITE, SOME INF/NAN VALUES:")
+  for key in some_infnan: print(key)
 
-  print ""
   if not all_infnan and not some_infnan:
-    print "CHECK PASSED: checkpoint contains no inf/NaN values"
+    print("CHECK PASSED: checkpoint contains no inf/NaN values")
   else:
-    print "CHECK FAILED: checkpoint contains some inf/NaN values"
+    print("CHECK FAILED: checkpoint contains some inf/NaN values")
diff --git a/model.py b/model.py
index 1cae561..51c8c4e 100644
--- a/model.py
+++ b/model.py
@@ -420,7 +420,7 @@ def decode_onestep(self, sess, batch, latest_tokens, enc_states, dec_init_states
     results = sess.run(to_return, feed_dict=feed) # run the decoder step
 
     # Convert results['states'] (a single LSTMStateTuple) into a list of LSTMStateTuple -- one for each hypothesis
-    new_states = [tf.contrib.rnn.LSTMStateTuple(results['states'].c[i, :], results['states'].h[i, :]) for i in xrange(beam_size)]
+    new_states = [tf.contrib.rnn.LSTMStateTuple(results['states'].c[i, :], results['states'].h[i, :]) for i in range(beam_size)]
 
     # Convert singleton list containing a tensor to a list of k arrays
     assert len(results['attn_dists'])==1
@@ -431,14 +431,14 @@ def decode_onestep(self, sess, batch, latest_tokens, enc_states, dec_init_states
       assert len(results['p_gens'])==1
       p_gens = results['p_gens'][0].tolist()
     else:
-      p_gens = [None for _ in xrange(beam_size)]
+      p_gens = [None for _ in range(beam_size)]
 
     # Convert the coverage tensor to a list length k containing the coverage vector for each hypothesis
     if FLAGS.coverage:
       new_coverage = results['coverage'].tolist()
       assert len(new_coverage) == beam_size
     else:
-      new_coverage = [None for _ in xrange(beam_size)]
+      new_coverage = [None for _ in range(beam_size)]
 
     return results['ids'], results['probs'], new_states, attn_dists, p_gens, new_coverage
 
diff --git a/run_summarization.py b/run_summarization.py
index b529719..b89e3bd 100644
--- a/run_summarization.py
+++ b/run_summarization.py
@@ -107,22 +107,22 @@ def restore_best_model():
 
   # Initialize all vars in the model
   sess = tf.Session(config=util.get_config())
-  print "Initializing all variables..."
+  print("Initializing all variables...")
   sess.run(tf.initialize_all_variables())
 
   # Restore the best model from eval dir
   saver = tf.train.Saver([v for v in tf.all_variables() if "Adagrad" not in v.name])
-  print "Restoring all non-adagrad variables from best model in eval dir..."
+  print("Restoring all non-adagrad variables from best model in eval dir...")
   curr_ckpt = util.load_ckpt(saver, sess, "eval")
-  print "Restored %s." % curr_ckpt
+  print ("Restored %s." % curr_ckpt)
 
   # Save this model to train dir and quit
   new_model_name = curr_ckpt.split("/")[-1].replace("bestmodel", "model")
   new_fname = os.path.join(FLAGS.log_root, "train", new_model_name)
-  print "Saving model to %s..." % (new_fname)
+  print ("Saving model to %s..." % (new_fname))
   new_saver = tf.train.Saver() # this saver saves all variables that now exist, including Adagrad variables
   new_saver.save(sess, new_fname)
-  print "Saved."
+  print ("Saved.")
   exit()
 
 
@@ -132,21 +132,21 @@ def convert_to_coverage_model():
 
   # initialize an entire coverage model from scratch
   sess = tf.Session(config=util.get_config())
-  print "initializing everything..."
+  print("initializing everything...")
   sess.run(tf.global_variables_initializer())
 
   # load all non-coverage weights from checkpoint
   saver = tf.train.Saver([v for v in tf.global_variables() if "coverage" not in v.name and "Adagrad" not in v.name])
-  print "restoring non-coverage variables..."
+  print("restoring non-coverage variables...")
   curr_ckpt = util.load_ckpt(saver, sess)
-  print "restored."
+  print("restored.")
 
   # save this model and quit
   new_fname = curr_ckpt + '_cov_init'
-  print "saving model to %s..." % (new_fname)
+  print("saving model to %s..." % (new_fname))
   new_saver = tf.train.Saver() # this one will save all variables that now exist
   new_saver.save(sess, new_fname)
-  print "saved."
+  print("saved.")
   exit()
 
 
@@ -294,7 +294,7 @@ def main(unused_argv):
   # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
   hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen']
   hps_dict = {}
-  for key,val in FLAGS.__flags.iteritems(): # for each flag
+  for key,val in FLAGS.__flags.items(): # for each flag
     if key in hparam_list: # if it's in the list
       hps_dict[key] = val # add it to the dict
   hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)
@@ -305,7 +305,7 @@ def main(unused_argv):
   tf.set_random_seed(111) # a seed value for randomness
 
   if hps.mode == 'train':
-    print "creating model..."
+    print("creating model...")
     model = SummarizationModel(hps, vocab)
     setup_training(model, batcher)
   elif hps.mode == 'eval':