#!/usr/bin/env python # ---------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- # This is a test driver for running end-to-end CNTK tests # # ----- Running a test and/or updating baselines ------ # For instructions see: # ./TestDriver.py --help # # ---- Adding the tests: ------- # File system organization: # Each test suite (e.g. Speech) has its own directory inside Tests # Each test (e.g. QuickE2E) has its own directory within test suite # # Each test directory has a following components: # - testcases.yml - main test confuguration file, whcih defines all test cases # - run-test - (run-test) script # - baseline*.txt - baseline files whith a captured expected output of run-test script # # ----- testcases.yml format ------- # dataDir: # # - # - .... # # testCases: # : # patterns: # - # see pattern language # - # - ..... # # : # patterns: # - # - # - ..... # ..... # # ----- pattern language -------- # Multpile patterns of the same testcase are matching a *single* line of text # Pattern is essentiually a substring which has to be found in a line # if pattern starts with ^ then matching is constrained to look only at the beginning of the line # # pattern can have one or multiple placelohders wrapped with double-curly braces: {{...}} # this placeholders can match any text conforming to the type constraint. Available placeholders # {{integer}} - matches any (positive or negative integer) value # {{float}} - matches any float value # {{float,tolerance=0.00001}} - matches float value with given absolute tolerance: 0.00001 in this example # {{float,tolerance=2%}} - matches float value with relative tolerance, 2% in this example # # At runtime patterns are compiled by TestDriver.py to regular expressions # # ---- Baseline files ---- # Order of searching baseline files, depends on the current mode for a given test: # # 1. baseline....txt # 2. baseline...txt # 3. baseline...txt # 4. baseline..txt # 5. baseline...txt # 6. baseline..txt # 7. baseline..txt # 8. baseline.txt # where = { debug | release } # = { cpu | gpu } # # Baseline files are optional. They only evaluate if test defines one or more pattern-drivern test cases. # If no test cases are defined, then TestDriver uses exit code of the run-test script as the only criteria # of successful copmpletion of the test. # ----- Tagging system ------ # Unit tests can be optionally tagged with 1 or many tags # CNTK build/test lab uses those tags to understand which tests to run during different flavors of build jobs (nightly, BVT, checkin) # # Tag can be optionally predicated with a python boolean expression over 'flavor' (debug/release), 'device' (cpu/gpu), 'os' (windows/linux) variables. # this allows to restrict tagging of the test to specific combinations of those variables # # ----- Algorithm ------ # Baseline verification: # For each testcase # - filter all lines which matches # - if no lines found then abord with an error - since either baseline and/or pattern are invalid # Running test: # Run test script (run-test) and capture output: # # For each testcase # - filter all matching lines from baseline # - filter all matching lines from test output # - compare filtered lines one by one, ensuring that substrings defined by patterns are matching # # In practice, TestDriver performs 1 pass through the output of run-test performing a real-time # matching against all test-cases/pattern simulteneously # import sys, os, argparse, traceback, yaml, subprocess, random, re, time, sets thisDir = os.path.dirname(os.path.realpath(__file__)) windows = os.getenv("OS")=="Windows_NT" # This class encapsulates an instance of the test class Test: # "Suite/TestName" => instance of Test allTestsIndexedByFullName = {} # suite - name of the test suite # name - name of the test # path to the testcases.yml file def __init__(self, suite, name, pathToYmlFile): self.suite = suite self.name = name self.fullName = suite + "/" + name # computing location of test directory (yml file directory) self.testDir = os.path.dirname(pathToYmlFile) # parsing yml file with testcases with open(pathToYmlFile, "r") as f: self.rawYamlData = yaml.safe_load(f.read()) # finding location of data directory if self.rawYamlData["dataDir"]: self.dataDir = os.path.realpath(os.path.join(self.testDir, self.rawYamlData["dataDir"])) else: self.dataDir = self.testDir # parsing test cases self.testCases = [] if "testCases" in self.rawYamlData.keys(): testCasesYaml = self.rawYamlData["testCases"] for name in testCasesYaml.keys(): try: self.testCases.append(TestCase(name, testCasesYaml[name])) except Exception as e: print >>sys.stderr, "ERROR registering test case: " + name raise # parsing all tags, example input: # tags: # - bvt-l (flavor=='debug') ^ (device=='cpu') # tag with a python predicate expression # - nightly-l #tag without a predicate # # Predicate expressions must produce boolean value and may refer to following variables: flavor, device, os self.tags = {} if self.rawYamlData["tags"]: for tagLine in self.rawYamlData["tags"]: tagLineSplit = tagLine.split(' ', 1) # splitting tag name from predicate expression tagName = tagLineSplit[0].lower().strip() # using specified python expression (or 'True' if former isn't provided) pythonExpr = tagLineSplit[1] if len(tagLineSplit)==2 else "True" # converting python expression into lambda and doing a smoke test by calling it with dummy parameters predicate = lambda pythonExpr=pythonExpr, **kwargs: eval(pythonExpr, kwargs) try: assert(type(predicate(flavor='foo', device='var', os='foobar')) == bool) except Exception as e: print "Can't parse tag predicate expression in {0} ({1}):\n{2}".format(pathToYmlFile, pythonExpr, e) raise e # saving generated lambda into tags dictionary self.tags[tagName] = predicate # Populates Tests.allTestsIndexedByFullName by scanning directory tree # and finding all testcases.yml files @staticmethod def discoverAllTests(): for dirName, subdirList, fileList in os.walk(thisDir): if 'testcases.yml' in fileList: testDir = dirName testName = os.path.basename(dirName) suiteDir = os.path.dirname(dirName) # sute name will be derived from the path components suiteName = os.path.relpath(suiteDir, thisDir).replace('\\', '/') try: test = Test(suiteName, testName, dirName + "/testcases.yml") Test.allTestsIndexedByFullName[test.fullName.lower()] = test except Exception as e: print >>sys.stderr, "ERROR registering test: " + dirName traceback.print_exc() sys.exit(1) # Runs this test # flavor - "debug" or "release" # device - "cpu" or "gpu" # args - command line arguments from argparse # returns an instance of TestRunResult def run(self, flavor, device, args): # measuring the time of running of the test startTime = time.time() result = self.runImpl(flavor, device, args) result.duration = time.time() - startTime return result def runImpl(self, flavor, device, args): result = TestRunResult() result.succeeded = True # Preparation for pattern-based test cases if len(self.testCases) > 0: # Locating and reading baseline file baselineFile = self.findBaselineFile(flavor, device) if baselineFile == None: return TestRunResult.fatalError("Baseline file sanity check", "Can't find baseline file") with open(baselineFile, "r") as f: baseline = f.read().split("\n") if args.verbose: print "Baseline:", baselineFile # Before running the test, pre-creating TestCaseRunResult object for each test case # and compute filtered lines from baseline file. # Note: some test cases might fail at this time if baseline and/or patterns are inconsistant if not args.update_baseline: for testCase in self.testCases: testCaseRunResult = testCase.processBaseline(baseline) if not testCaseRunResult.succeeded: result.succeeded = False result.testCaseRunResults.append(testCaseRunResult) # preparing run directory runDir = os.path.join(args.run_dir, "{0}_{1}@{2}_{3}".format(self.suite, self.name, flavor, device)) if not os.path.isdir(runDir): os.makedirs(runDir) # preparing environment for the test script os.environ["TEST_FLAVOR"] = flavor os.environ["TEST_DEVICE"] = device os.environ["TEST_BUILD_LOCATION"] = args.build_location if windows: os.environ["TEST_CNTK_BINARY"] = os.path.join(args.build_location, flavor, "cntk.exe") os.environ["MPI_BINARY"] = os.path.join(os.environ["MSMPI_BIN"], "mpiexec.exe") else: os.environ["TEST_CNTK_BINARY"] = os.path.join(args.build_location, flavor, "bin", "cntk") os.environ["MPI_BINARY"] = "mpiexec" os.environ["TEST_DIR"] = self.testDir os.environ["TEST_DATA_DIR"] = self.dataDir os.environ["TEST_RUN_DIR"] = runDir # WORKAROUND: changing current dir to the dataDir so relative paths in SCP files work as expected os.chdir(self.dataDir) # Running test script #TODO:port this properly to windows # Writing standard output to the file and to the console (if --verbose) logFile = os.path.join(runDir, "output.txt") allLines = [] if args.verbose: print self.fullName + ":>" + logFile with open(logFile, "w") as output: cmdLine = ["bash", "-c", self.testDir + "/run-test 2>&1"] process = subprocess.Popen(cmdLine, stdout=subprocess.PIPE) while True: line = process.stdout.readline() if not line: break if len(line)>0 and line[-1]=='\n': line=line[:len(line)-1] if args.verbose: print self.fullName + ": " + line if args.dry_run: print line continue print >>output, line allLines.append(line) output.flush() for testCaseRunResult in result.testCaseRunResults: testCaseRunResult.testCase.processLine(line, testCaseRunResult, args.verbose) exitCode = process.wait() success = True # saving log file path, so it can be reported later result.logFile = logFile # checking exit code if exitCode != 0: if args.dry_run: print "[SKIPPED]" return result else: return TestRunResult.fatalError("Exit code must be 0", "==> got exit code {0} when running: {1}".format(exitCode, " ".join(cmdLine)), logFile = logFile) # finalizing verification - need to check whether we have any unmatched lines for testCaseRunResult in result.testCaseRunResults: testCaseRunResult.testCase.finalize(testCaseRunResult) if not testCaseRunResult.succeeded: result.succeeded = False if (self.testCases)>0 and args.update_baseline and result.succeeded: # When running in --update-baseline mode # verifying that new output is succesfully matching every pattern in the testcases.yml # If this is not the case then baseline update will be rejected for testCase in self.testCases: testCaseRunResult = testCase.processBaseline(allLines) if not testCaseRunResult.succeeded: result.succeeded = False result.testCaseRunResults.append(testCaseRunResult) if result.succeeded: if args.verbose: print "Updating baseline file", baselineFile with open(baselineFile, "w") as f: f.write("\n".join(allLines)) return result # Finds a location of a baseline file by probing different names in the following order: # baseline.$os.$flavor.$device.txt # baseline.$os.$flavor.txt # baseline.$os.$device.txt # baseline.$os.txt # baseline.$flavor.$device.txt # baseline.$flavor.txt # baseline.$device.txt # baseline.txt def findBaselineFile(self, flavor, device): for o in ["." + ("windows" if windows else "linux"), ""]: for f in ["." + flavor.lower(), ""]: for d in ["." + device.lower(), ""]: candidateName = "baseline" + o + f + d + ".txt" fullPath = os.path.join(self.testDir, candidateName) if os.path.isfile(fullPath): return fullPath return None # Checks whether the test matches the specified tag, # returns matched tag name on succes, or None if there is no match(boolean, string) tuple def matchesTag(self, tag, flavor, device, os): tagL = tag.lower() # normalizing the tag for comparison # enumerating all the tags for tag in self.tags.keys(): # match by direct string comparison or by prefix matching rule: # e.g: 'bvt' matches 'bvt' 'bvt-a', 'bvt-b' but not 'bvtx' if tag==tagL or tag.startswith(tagL + "-"): # evaluating tag's predicate if self.tags[tag](flavor=flavor, device=device, os=os): return tag return None # This class encapsulates one testcase (in testcases.yml file) class TestCase: def __init__(self, name, yamlNode): self.name = name self.patterns = [] if "patterns" in yamlNode: for pattern in yamlNode["patterns"]: try: self.patterns.append(TestPattern(pattern)) except Exception as e: print >>sys.stderr, "ERROR registering pattern: " + pattern raise # Processes the baseline file and return an instance of TestCaseRunResult # which is ready to be passed into processLine def processBaseline(self, baseline): result = TestCaseRunResult(self.name, True) result.diagnostics = "" result.testCase = self # filter all lines of baseline file leaving only those which match ALL the patterns filteredLines = [] for line in baseline: if all([p.match(line) for p in self.patterns]): filteredLines.append(line) if len(filteredLines) == 0: result.succeeded = False result.diagnostics+="Baseline file doesn't have any lines matching all patterns defined in the test case.\n"\ "Possible cause: patterns are wrong and/or baseline file doesn't have required line" result.expectedLines = filteredLines return result # Runs this test case and report result into TestCaseRunResult def processLine(self, line, result, verbose): if all([p.match(line) for p in self.patterns]): if len(result.expectedLines) > 0: # we have mathed line in the output and at leat one remaining unmatched in a baseline expected = result.expectedLines[0] # running comparison logic for each pattern failedPatterns = [] for p in self.patterns: if not p.compare(expected, line): result.succeeded = False failedPatterns.append(p) # in the case of failure - reporting mismatched lines if len(failedPatterns)>0: result.diagnostics+=("Baseline: {0}\n"+ "Output: {1}\n" ).format(expected, line) if verbose: print "[FAILED]: Testcase", self.name print "Baseline:", expected # also show all failed patterns for p in failedPatterns: msg = "Failed pattern: " + p.patternText if verbose: print msg result.diagnostics+=msg+"\n" # removing this line, since we already matched it (whether succesfully or not - doesn't matter) del result.expectedLines[0] else: # we have matched line in the output - but don't have any remaining unmatched in a baseline result.succeeded = False result.diagnostics+=("Unexpected (extra) line in the output which matches the pattern, but doesn't appear in baseline file.\n"+ "Extra line: {0}" ).format(line) # called once for each TestCaseRunResult at the end to check for unmatched patterns def finalize(self, result): if len(result.expectedLines) > 0: result.succeeded = False result.diagnostics+=("{0} expected lines weren't observed in the output.\n"+ "First unmatched: {1}" ).format(len(result.expectedLines), result.expectedLines[0]) # This encapsulates parsing and evaluation of a test patterns occurring in testcases.yml file class TestPattern: # maps a type (specified in {{...}} expressions) to a regular expression typeTable = { "integer" : r"\s*-?[0-9]+", "float" : r"\s*-?([0-9]*\.[0-9]+|[0-9]+)(e[+-]?[0-9]+)?" } def __init__(self, patternText): self.patternText = str(patternText) if len(patternText) == 0: raise Exception("Empty pattern") if patternText[0]=='^': patternText = patternText[1:] prefix = "^" else: prefix = ".*?" # After parsing this will be a list of tuples (dataType, tolerance) for each {{...}} section from left to right self.groupInfo = [] # Transforming our pattern into a sigle regular expression # processing {{...}} fragments and escaping all regex special characters self.regexText = prefix + re.sub(r"(\{\{[^}]+\}\}|[\[\]\.\*\+\{\}\(\)\$\^\\\|\?])", self.patternParse, patternText) # Compiling it to perform a check (fail-fast) and for faster matching later self.regex = re.compile(self.regexText) # this is a callback method passed to re.sub call above - it performs the core parsing logic def patternParse(self, match): fragment = match.group(1) if len(fragment) == 1: # this is a spexcial character of regex return "\\" + fragment; else: # parsing {{...}} expressions m = re.match(r"{{(integer|float)(,tolerance=([-0-9\.e]*)(%?))?}}", fragment) dataType = m.group(1) if m.group(3): tolerance = float(m.group(3)) if m.group(4) == "%": # using minus sign to indicate that it is a relative value tolerance = - tolerance/100.0; else: tolerance = 0.0 # saving information about data type and tolerance self.groupInfo.append((dataType, tolerance)) # converting this to regex which mathes specific type # All {{...}} sections are converted to regex groups named as G0, G1, G2... return "(?P{1})".format(len(self.groupInfo)-1, TestPattern.typeTable[dataType]) # Checks wether given line matches this pattern # returns True or False def match(self, line): return self.regex.match(line) != None # Compares a line from baseline log and a line from real output against this pattern # return true or false def compare(self, expected, actual): em = self.regex.match(expected) am = self.regex.match(actual) if em == None and am == None: return True if em == None or am == None: return False for i in range(0, len(self.groupInfo)): dataType, tolerance = self.groupInfo[i] groupId = "G"+str(i) expectedText = em.group(groupId).strip() actualText = am.group(groupId).strip() if dataType=="integer": return int(expectedText) == int(actualText) elif dataType=="float": epsilon = tolerance if tolerance > 0 else abs(float(expectedText)*tolerance) return abs(float(expectedText)-float(actualText)) <= epsilon else: return False; return True class TestRunResult: def __init__(self): self.succeeded = False; self.testCaseRunResults = [] # list of TestCaseRunResult self.duration = -1 @staticmethod def fatalError(name, diagnostics, logFile = None): r = TestRunResult() r.testCaseRunResults.append(TestCaseRunResult(name, False, diagnostics)) r.logFile = logFile return r class TestCaseRunResult: def __init__(self, testCaseName, succeeded, diagnostics = None): self.testCaseName = testCaseName self.succeeded = succeeded self.diagnostics = diagnostics self.expectedLines = [] # list of remaining unmatched expected lines from the baseline file for this test case run # Lists all available tests def listCommand(args): testsByTag = {} for test in Test.allTestsIndexedByFullName.values(): for flavor in args.flavors: for device in args.devices: for os in args.oses: tag = test.matchesTag(args.tag, flavor, device, os) if args.tag else '*' if tag: if tag in testsByTag.keys(): testsByTag[tag].add(test.fullName) else: testsByTag[tag] = sets.Set([test.fullName]) for tag in sorted(testsByTag.keys()): if tag=="*": print ' '.join(sorted(testsByTag[tag])) else: print tag+":", ' '.join(sorted(testsByTag[tag])) # Runs given test(s) or all tests def runCommand(args): if len(args.test) > 0: testsToRun = [] for name in args.test: if name.lower() in Test.allTestsIndexedByFullName: testsToRun.append(Test.allTestsIndexedByFullName[name.lower()]) else: print >>sys.stderr, "ERROR: test not found", name return 1 else: testsToRun = Test.allTestsIndexedByFullName.values() devices = args.devices flavors = args.flavors os.environ["TEST_ROOT_DIR"] = os.path.dirname(os.path.realpath(sys.argv[0])) print "CNTK Test Driver is started" print "Running tests: ", " ".join([y.fullName for y in testsToRun]) print "Build location: ", args.build_location print "Run location: ", args.run_dir print "Flavors: ", " ".join(flavors) print "Devices: ", " ".join(devices) if (args.update_baseline): print "*** Running in automatic baseline update mode ***" print "" if args.dry_run: os.environ["DRY_RUN"] = "1" succeededCount, totalCount = 0, 0 for test in testsToRun: for flavor in flavors: for device in devices: if args.tag and args.tag != '' and not test.matchesTag(args.tag, flavor, device, 'windows' if windows else 'linux'): continue totalCount = totalCount + 1 if len(test.testCases)==0: # forcing verbose mode (showing all output) for all test which are based on exit code (no pattern-based test cases) args.verbose = True # Printing the test which is about to run (without terminating the line) sys.stdout.write("Running test {0} ({1} {2}) - ".format(test.fullName, flavor, device)); if args.dry_run: print "[SKIPPED] (dry-run)" # in verbose mode, terminate the line, since there will be a lot of output if args.verbose: sys.stdout.write("\n"); sys.stdout.flush() # Running the test and collecting a run results result = test.run(flavor, device, args) if args.verbose: # writing the test name one more time (after possibly long verbose output) sys.stdout.write("Test finished {0} ({1} {2}) - ".format(test.fullName, flavor, device)); if result.succeeded: succeededCount = succeededCount + 1 # in no-verbose mode this will be printed in the same line as 'Running test...' print "[OK] {0:.2f} sec".format(result.duration) else: print "[FAILED] {0:.2f} sec".format(result.duration) # Showing per-test-case results: for testCaseRunResult in result.testCaseRunResults: if testCaseRunResult.succeeded: # Printing 'OK' test cases only in verbose mode if (args.verbose): print(" [OK] " + testCaseRunResult.testCaseName); else: # 'FAILED' + detailed diagnostics with proper indendtation print(" [FAILED] " + testCaseRunResult.testCaseName); if testCaseRunResult.diagnostics: for line in testCaseRunResult.diagnostics.split('\n'): print " " + line; # In non-verbose mode log wasn't piped to the stdout, showing log file path for conveniencce if not result.succeeded and not args.verbose and result.logFile: print " See log file for details:", result.logFile if args.update_baseline: print "{0}/{1} baselines updated, {2} failed".format(succeededCount, totalCount, totalCount - succeededCount) else: print "{0}/{1} tests passed, {2} failed".format(succeededCount, totalCount, totalCount - succeededCount) if succeededCount != totalCount: sys.exit(10) # ======================= Entry point ======================= parser = argparse.ArgumentParser(description="TestDriver - CNTK Test Driver") subparsers = parser.add_subparsers(help="command to execute. Run TestDriver.py --help for command-specific help") runSubparser = subparsers.add_parser("run", help="run test(s)") runSubparser.add_argument("test", nargs="*", help="optional test name(s) to run, specified as Suite/TestName. " "Use list command to list available tests. " "If not specified then all tests will be run.") defaultBuildLocation=os.path.realpath(os.path.join(thisDir, "..", "x64" if windows else "build")) runSubparser.add_argument("-b", "--build-location", default=defaultBuildLocation, help="location of the CNTK build to run") runSubparser.add_argument("-t", "--tag", help="runs tests which match the spacified tag") runSubparser.add_argument("-d", "--device", help="cpu|gpu - run on a specified device") runSubparser.add_argument("-f", "--flavor", help="release|debug - run only a specified flavor") tmpDir = os.getenv("TEMP") if windows else "/tmp" defaultRunDir=os.path.join(tmpDir, "cntk-test-{0}.{1}".format(time.strftime("%Y%m%d%H%M%S"), random.randint(0,1000000))) runSubparser.add_argument("-r", "--run-dir", default=defaultRunDir, help="directory where to store test output, default: a random dir within /tmp") runSubparser.add_argument("--update-baseline", action='store_true', help="update baseline file(s) instead of matching them") runSubparser.add_argument("-v", "--verbose", action='store_true', help="verbose output - dump all output of test script") runSubparser.add_argument("-n", "--dry-run", action='store_true', help="do not run the tests, only print test names and configurations to be run along with full command lines") runSubparser.set_defaults(func=runCommand) listSubparser = subparsers.add_parser("list", help="list available tests") listSubparser.add_argument("-t", "--tag", help="limits a resulting list to tests matching the spacified tag") listSubparser.add_argument("-d", "--device", help="cpu|gpu - tests for a specified device") listSubparser.add_argument("-f", "--flavor", help="release|debug - tests for specified flavor") listSubparser.add_argument("--os", help="windows|linux - tests for a specified operating system") listSubparser.set_defaults(func=listCommand) if len(sys.argv)==1: parser.print_help() sys.exit(1) args = parser.parse_args(sys.argv[1:]) # parsing a --device, --flavor and --os options: args.devices = ["cpu", "gpu"] if (args.device): args.device = args.device.lower() if not args.device in args.devices: print >>sys.stderr, "--device must be one of", args.devices sys.exit(1) args.devices = [args.device] args.flavors = ["debug", "release"] if (args.flavor): args.flavor = args.flavor.lower() if not args.flavor in args.flavors: print >>sys.stderr, "--flavor must be one of", args.flavors sys.exit(1) args.flavors = [args.flavor] if args.func == listCommand: args.oses = ["windows", "linux"] if (args.os): args.os = args.os.lower() if not args.os in args.oses: print >>sys.stderr, "--os must be one of", args.oses sys.exit(1) args.oses = [args.os] # discover all the tests Test.discoverAllTests() # execute the command args.func(args)