Skip to content

upgrade python env to 3.0 and tensorflow version to 2.2 #12

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions shifu-tensorflow-eval/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@
<classifier>hdp-yarn</classifier>
<outputDirectory>${project.build.directory}</outputDirectory>
</artifactItem>
<dependency>
<groupId>net.lingala.zip4j</groupId>
<artifactId>zip4j</artifactId>
<version>1.3.2</version>
</dependency>
<artifactItem>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
Expand Down
36 changes: 12 additions & 24 deletions shifu-tensorflow-on-yarn/src/main/resources/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,21 @@
Author: Tommy Mulc
"""

import json
import logging
# from __future__ import print_function
import os
import tensorflow as tf
import argparse
import time
import sys
import logging
import gzip
from StringIO import StringIO
import random
import numpy as np
from tensorflow.python.platform import gfile
from tensorflow.python.framework import ops
from tensorflow.python.saved_model import builder
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import signature_def_utils
from tensorflow.python.saved_model import tag_constants
import json

import tensorflow as tf

REPLICAS_TO_AGGREGATE_RATIO = 1
FEATURE_COUNT = 30
HIDDEN_NODES_COUNT = 20
VALID_TRAINING_DATA_RATIO = 0.3
DELIMITER = '|'
BATCH_SIZE = 10
EPOCH = 10 # TODO: should consider recovery from checkpoint, we need to reduce current global step
EPOCH = 10 # TODO: should consider recovery from checkpoint, we need to reduce current global step
WORKING_DIR = "hdfs://horton/user/webai/.yarn/"

# read from env
Expand All @@ -49,23 +38,22 @@ def main(_):

# allows this node know about all other nodes
if job_name == 'ps': # checks if parameter server
server = tf.train.Server(cluster,
job_name="ps",
task_index=task_index)
server = tf.compat.v1.train.Server(cluster,
job_name="ps",
task_index=task_index)
server.join()
else: # it must be a worker server
logging.info("Loading data from worker index = %d" % task_index)

server = tf.train.Server(cluster,
job_name="worker",
task_index=task_index)
server = tf.compat.v1.train.Server(cluster,
job_name="worker",
task_index=task_index)

logging.info("backup worker join!!")

while True:
time.sleep(1000000)



if __name__ == '__main__':
tf.app.run()
tf.compat.v1.app.run()