#!/usr/bin/env python3

"""
An example showing use of nested NVTX markers.
"""

import torch
import torch.nn as nn
import torch.cuda.profiler as profiler
import torch.cuda.nvtx as nvtx
from apex import pyprof

pyprof.nvtx.init()


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class Bottleneck(nn.Module):
    expansion = 4
    count = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

        self.id = Bottleneck.count
        Bottleneck.count += 1

    def forward(self, x):
        identity = x

        nvtx.range_push("layer:Bottleneck_{}".format(self.id))

        nvtx.range_push("layer:Conv1")
        out = self.conv1(x)
        nvtx.range_pop()

        nvtx.range_push("layer:BN1")
        out = self.bn1(out)
        nvtx.range_pop()

        nvtx.range_push("layer:ReLU")
        out = self.relu(out)
        nvtx.range_pop()

        nvtx.range_push("layer:Conv2")
        out = self.conv2(out)
        nvtx.range_pop()

        nvtx.range_push("layer:BN2")
        out = self.bn2(out)
        nvtx.range_pop()

        nvtx.range_push("layer:ReLU")
        out = self.relu(out)
        nvtx.range_pop()

        nvtx.range_push("layer:Conv3")
        out = self.conv3(out)
        nvtx.range_pop()

        nvtx.range_push("layer:BN3")
        out = self.bn3(out)
        nvtx.range_pop()

        if self.downsample is not None:
            nvtx.range_push("layer:Downsample")
            identity = self.downsample(x)
            nvtx.range_pop()

        nvtx.range_push("layer:Residual")
        out += identity
        nvtx.range_pop()

        nvtx.range_push("layer:ReLU")
        out = self.relu(out)
        nvtx.range_pop()

        nvtx.range_pop()

        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, groups=1,
                 width_per_group=64, norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def forward(self, x):
        nvtx.range_push("layer:conv1_x")
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        nvtx.range_pop()

        nvtx.range_push("layer:conv2_x")
        x = self.layer1(x)
        nvtx.range_pop()

        nvtx.range_push("layer:conv3_x")
        x = self.layer2(x)
        nvtx.range_pop()

        nvtx.range_push("layer:conv4_x")
        x = self.layer3(x)
        nvtx.range_pop()

        nvtx.range_push("layer:conv5_x")
        x = self.layer4(x)
        nvtx.range_pop()

        x = self.avgpool(x)
        x = torch.flatten(x, 1)

        nvtx.range_push("layer:FC")
        x = self.fc(x)
        nvtx.range_pop()

        return x


def resnet50():
    return ResNet(Bottleneck, [3, 4, 6, 3])


# Create model
net = resnet50().cuda().half()
net.train()

# Create optimizer
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

# Create synthetic input and label
x = torch.rand(32, 3, 224, 224).cuda().half()
target = torch.empty(32, dtype=torch.long).random_(1000).cuda()

with torch.autograd.profiler.emit_nvtx():
    profiler.start()
    output = net(x)
    loss = criterion(output, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    profiler.stop()
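
# ---------------------------------------------------------------------------
# Usage note (illustrative addition, not part of the original example): because
# profiling is gated by profiler.start()/profiler.stop() above, a kernel trace
# for this script would typically be captured by launching it under nvprof with
# profiling deferred until profiler.start(), along the lines of:
#
#   nvprof -f -o net.sql --profile-from-start off -- python <this_script>.py
#
# The output file name and script name here are placeholders, and the exact
# command plus any apex.pyprof post-processing steps depend on the CUDA toolkit
# and apex version in use; treat this as a hedged sketch, not the canonical
# invocation.
# ---------------------------------------------------------------------------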