diff --git a/Source/Core/VideoCommon/Src/BPMemory.h b/Source/Core/VideoCommon/Src/BPMemory.h
index cb022893fd..0a62bea162 100644
--- a/Source/Core/VideoCommon/Src/BPMemory.h
+++ b/Source/Core/VideoCommon/Src/BPMemory.h
@@ -451,7 +451,8 @@ union TexMode0
         unsigned mag_filter : 1;
         unsigned min_filter : 3;
         unsigned diag_lod : 1;
-        signed lod_bias : 10;
+        signed lod_bias : 8;
+		unsigned pad0 : 2;
         unsigned max_aniso : 2;
         unsigned lod_clamp : 1;
     };
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.cpp
index 0507503267..b97ad3795d 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.cpp
@@ -90,13 +90,13 @@ namespace Clipper
     static inline int CalcClipMask(OutputVertexData *v)
     {
 	    int cmask = 0;
-        float* pos = v->projectedPosition;
-	    if (pos[3] - pos[0] < 0) cmask |= CLIP_POS_X_BIT;
-	    if (pos[0] + pos[3] < 0) cmask |= CLIP_NEG_X_BIT;
-	    if (pos[3] - pos[1] < 0) cmask |= CLIP_POS_Y_BIT;
-	    if (pos[1] + pos[3] < 0) cmask |= CLIP_NEG_Y_BIT;
-	    if (pos[3] * pos[2] > 0) cmask |= CLIP_POS_Z_BIT;
-	    if (pos[2] + pos[3] < 0) cmask |= CLIP_NEG_Z_BIT;
+        Vec4 pos = v->projectedPosition; // local copy of the clip-space position
+	    if (pos.w - pos.x < 0) cmask |= CLIP_POS_X_BIT; // x >  w
+	    if (pos.x + pos.w < 0) cmask |= CLIP_NEG_X_BIT; // x < -w
+	    if (pos.w - pos.y < 0) cmask |= CLIP_POS_Y_BIT; // y >  w
+	    if (pos.y + pos.w < 0) cmask |= CLIP_NEG_Y_BIT; // y < -w
+	    if (pos.w * pos.z > 0) cmask |= CLIP_POS_Z_BIT; // NOTE(review): a product, unlike the sum/difference tests on the other planes - confirm intended
+	    if (pos.z + pos.w < 0) cmask |= CLIP_NEG_Z_BIT; // z < -w
 	    return cmask;
     }
 
@@ -109,7 +109,7 @@ namespace Clipper
     #define DIFFERENT_SIGNS(x,y) ((x <= 0 && y > 0) || (x > 0 && y <= 0))
 
     #define CLIP_DOTPROD(I, A, B, C, D) \
-	    (Vertices[I]->projectedPosition[0] * A + Vertices[I]->projectedPosition[1] * B + Vertices[I]->projectedPosition[2] * C + Vertices[I]->projectedPosition[3] * D)
+	    (Vertices[I]->projectedPosition.x * (A) + Vertices[I]->projectedPosition.y * (B) + Vertices[I]->projectedPosition.z * (C) + Vertices[I]->projectedPosition.w * (D)) /* dot product of vertex I's clip-space position with the plane (A, B, C, D); arguments are parenthesized so expression arguments expand safely */
 
     #define POLY_CLIP( PLANE_BIT, A, B, C, D )                          \
     {                                                                   \
@@ -153,6 +153,27 @@ namespace Clipper
 	    }									                            \
     }
 
+	#define LINE_CLIP(PLANE_BIT, A, B, C, D )					\
+	{	/* clip segment Vertices[0]-Vertices[1] against plane (A,B,C,D); uses mask, t0, t1 from the expansion site */	\
+		if (mask & PLANE_BIT) {									\
+			const float dp0 = CLIP_DOTPROD( 0, A, B, C, D );	\
+			const float dp1 = CLIP_DOTPROD( 1, A, B, C, D );	\
+			const bool neg_dp0 = dp0 < 0;						\
+			const bool neg_dp1 = dp1 < 0;						\
+			/* both endpoints on the negative side: return from the enclosing function */	\
+			if (neg_dp0 && neg_dp1)								\
+				return;											\
+			/* otherwise keep the largest cut parameter for the endpoint on the negative side */	\
+			if (neg_dp1) {										\
+				float t = dp1 / (dp1 - dp0);					\
+				if (t > t1) t1 = t;								\
+			} else if (neg_dp0) {								\
+				float t = dp0 / (dp0 - dp1);					\
+				if (t > t0) t0 = t;								\
+			}													\
+		}														\
+	}
+
     void ClipTriangle(int *indices, int &numIndices)
     {
 	    int mask = 0;
@@ -202,6 +223,53 @@ namespace Clipper
 	    }
     }
 
+	void ClipLine(int *indices)
+	{
+		// Clips the segment Vertices[0]-Vertices[1] against the six clip planes.
+		// On return, indices[] names the two vertex slots to draw, or holds SKIP_FLAG
+		// in both entries when both endpoints are outside the same plane.
+		const int clip_mask[2] = { CalcClipMask(Vertices[0]), CalcClipMask(Vertices[1]) };
+		const int mask = clip_mask[0] | clip_mask[1];
+
+		// Neither endpoint is outside any plane: leave the caller's indices untouched.
+		if (!mask)
+			return;
+
+		// Cut parameters: t0 is applied from vertex 0 towards 1, t1 from vertex 1 towards 0.
+		float t0 = 0.0f, t1 = 0.0f;
+
+		// LINE_CLIP returns from this function when the line is fully clipped,
+		// so flag both entries as skipped before running the plane tests.
+		indices[0] = indices[1] = SKIP_FLAG;
+
+		LINE_CLIP(CLIP_POS_X_BIT, -1,  0,  0, 1);
+		LINE_CLIP(CLIP_NEG_X_BIT,  1,  0,  0, 1);
+		LINE_CLIP(CLIP_POS_Y_BIT,  0, -1,  0, 1);
+		LINE_CLIP(CLIP_NEG_Y_BIT,  0,  1,  0, 1);
+		LINE_CLIP(CLIP_POS_Z_BIT,  0,  0, -1, 1);
+		LINE_CLIP(CLIP_NEG_Z_BIT,  0,  0,  1, 1);
+
+		// Survived every plane: start again from the two source vertices.
+		indices[0] = 0;
+		indices[1] = 1;
+
+		// Slots 0 and 1 hold the inputs. numVertices is handed to AddInterpolatedVertex,
+		// which presumably stores the new vertex in that slot and advances it (not visible here).
+		int numVertices = 2;
+
+		if (clip_mask[0] != 0)
+		{
+			indices[0] = numVertices;
+			AddInterpolatedVertex(t0, 0, 1, numVertices);
+		}
+
+		if (clip_mask[1] != 0)
+		{
+			indices[1] = numVertices;
+			AddInterpolatedVertex(t1, 1, 0, numVertices);
+		}
+	}
+
     void ProcessTriangle(OutputVertexData *v0, OutputVertexData *v1, OutputVertexData *v2)
     {
         if (stats.thisFrame.numDrawnObjects < g_Config.drawStart || stats.thisFrame.numDrawnObjects >= g_Config.drawEnd )
@@ -247,6 +315,75 @@ namespace Clipper
         }
     }
 
+	// Copies *src into *dst and shifts the copy's screen position by (dx, dy).
+	// The whole vertex is copied, including projectedPosition and color[1]:
+	// DrawTriangleFrontFace reads projectedPosition.w to build its 1/w slope, and
+	// the previous field-by-field copy left those members of the stack-allocated
+	// destination uninitialized.
+	// sOffset is accepted but not applied yet (see the todo below).
+	void CopyVertex(OutputVertexData *dst, OutputVertexData *src, float dx, float dy, unsigned int sOffset)
+	{
+		*dst = *src;
+
+		dst->screenPosition.x += dx;
+		dst->screenPosition.y += dy;
+
+		// todo - s offset
+		(void)sOffset;
+	}
+
+	// Clips the line lineV0-lineV1 and, unless it is fully clipped, rasterizes it
+	// as a quad (two front-facing triangles) widened by bpmem.lineptwidth.linesize.
+	void ProcessLine(OutputVertexData *lineV0, OutputVertexData *lineV1)
+	{
+		int indices[4] = { 0, 1, SKIP_FLAG, SKIP_FLAG };
+
+		Vertices[0] = lineV0;
+		Vertices[1] = lineV1;
+
+		ClipLine(indices);
+
+		// ClipLine flags both entries when the line is entirely outside a clip plane.
+		if (indices[0] == SKIP_FLAG)
+			return;
+
+		OutputVertexData *v0 = Vertices[indices[0]];
+		OutputVertexData *v1 = Vertices[indices[1]];
+
+		PerspectiveDivide(v0);
+		PerspectiveDivide(v1);
+
+		const float dx = v1->screenPosition.x - v0->screenPosition.x;
+		const float dy = v1->screenPosition.y - v0->screenPosition.y;
+
+		// Take magnitudes in float arithmetic. Unqualified abs() can bind to the
+		// int overload from <stdlib.h>, truncating sub-pixel deltas to zero.
+		const float absDx = dx < 0 ? -dx : dx;
+		const float absDy = dy < 0 ? -dy : dy;
+
+		// Offset perpendicular to the major axis; the sign follows the line direction.
+		const float widthOffset = bpmem.lineptwidth.linesize / 12.0f;
+		float screenDx = 0;
+		float screenDy = 0;
+
+		if (absDx > absDy)
+			screenDy = (dx > 0) ? -widthOffset : widthOffset;
+		else
+			screenDx = (dy > 0) ? widthOffset : -widthOffset;
+
+		OutputVertexData triangle[3];
+
+		CopyVertex(&triangle[0], v0, screenDx, screenDy, 0);
+		CopyVertex(&triangle[1], v1, screenDx, screenDy, 0);
+		CopyVertex(&triangle[2], v1, -screenDx, -screenDy, bpmem.lineptwidth.lineoff);
+
+		// ccw winding
+		Rasterizer::DrawTriangleFrontFace(&triangle[2], &triangle[1], &triangle[0]);
+
+		// Reuse slot 1 for the fourth corner to form the second triangle of the quad.
+		CopyVertex(&triangle[1], v0, -screenDx, -screenDy, bpmem.lineptwidth.lineoff);
+		Rasterizer::DrawTriangleFrontFace(&triangle[0], &triangle[1], &triangle[2]);
+	}
         
     bool CullTest(OutputVertexData *v0, OutputVertexData *v1, OutputVertexData *v2, bool &backface)
     {
@@ -260,15 +397,15 @@ namespace Clipper
             return false;
         }
 
-        float x0 = v0->projectedPosition[0];
-        float x1 = v1->projectedPosition[0];
-        float x2 = v2->projectedPosition[0];
-        float y1 = v1->projectedPosition[1];
-        float y0 = v0->projectedPosition[1];
-        float y2 = v2->projectedPosition[1];
-        float w0 = v0->projectedPosition[3];
-        float w1 = v1->projectedPosition[3];
-        float w2 = v2->projectedPosition[3];
+        float x0 = v0->projectedPosition.x;
+        float x1 = v1->projectedPosition.x;
+        float x2 = v2->projectedPosition.x;
+        float y1 = v1->projectedPosition.y;
+        float y0 = v0->projectedPosition.y;
+        float y2 = v2->projectedPosition.y;
+        float w0 = v0->projectedPosition.w;
+        float w1 = v1->projectedPosition.w;
+        float w2 = v2->projectedPosition.w;
 
         float normalZDir = (x0*w2 - x2*w0)*y1 + (x2*y0 - x0*y2)*w1 + (y2*w0 - y0*w2)*x1; 
 
@@ -291,13 +428,13 @@ namespace Clipper
 
     void PerspectiveDivide(OutputVertexData *vertex)
     {
-        float *projected = vertex->projectedPosition;
-        float *screen = vertex->screenPosition;
+        Vec4 &projected = vertex->projectedPosition;
+        Vec3 &screen = vertex->screenPosition;
 
-        float wInverse = 1.0f/projected[3];
-        screen[0] = projected[0] * wInverse * xfregs.viewport.wd + m_ViewOffset[0];
-        screen[1] = projected[1] * wInverse * xfregs.viewport.ht + m_ViewOffset[1];
-        screen[2] = projected[2] * wInverse + m_ViewOffset[2];
+        float wInverse = 1.0f/projected.w; // NOTE(review): no guard here for projected.w == 0
+        screen.x = projected.x * wInverse * xfregs.viewport.wd + m_ViewOffset[0]; // x/w scaled by viewport.wd, then offset
+        screen.y = projected.y * wInverse * xfregs.viewport.ht + m_ViewOffset[1]; // y/w scaled by viewport.ht, then offset
+        screen.z = projected.z * wInverse + m_ViewOffset[2]; // z/w is only offset, not scaled
     }
     
 }
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.h b/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.h
index 476b224783..ee9e1d8ebb 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Clipper.h
@@ -31,6 +31,7 @@ namespace Clipper
 
     void ProcessTriangle(OutputVertexData *v0, OutputVertexData *v1, OutputVertexData *v2);
 
+	void ProcessLine(OutputVertexData *v0, OutputVertexData *v1); // clips the line v0-v1 and draws it as two triangles (see Clipper.cpp)
 
     bool CullTest(OutputVertexData *v0, OutputVertexData *v1, OutputVertexData *v2, bool &backface);
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.cpp
index 95a73d6721..05cb82ee06 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.cpp
@@ -49,36 +49,32 @@ void Init()
     }
 }
 
-bool SaveTexture(const char* filename, u32 texmap, int width, int height)
-{
-    u8 *data = new u8[width * height * 4];
-    
-    GetTextureBGRA(data, texmap, width, height);
-
-    bool result = SaveTGA(filename, width, height, data);
-
-    delete []data;
-
-    return result;
-}
-
-void SaveTexture(const char* filename, u32 texmap)
+void SaveTexture(const char* filename, u32 texmap, s32 mip)
 {
     FourTexUnits& texUnit = bpmem.tex[(texmap >> 2) & 1];
     u8 subTexmap = texmap & 3;
 
     TexImage0& ti0 = texUnit.texImage0[subTexmap];
 
-    SaveTexture(filename, texmap, ti0.width + 1, ti0.height + 1);
+    int width = ti0.width + 1;
+    int height = ti0.height + 1;
+
+    // NOTE(review): the buffer uses the level-0 size for every mip - confirm this
+    // is what GetTextureBGRA/SampleMip expect for mip > 0.
+    u8 *data = new u8[width * height * 4];
+    GetTextureBGRA(data, texmap, mip, width, height);
+    // SaveTexture returns void, so the status is discarded explicitly instead of being kept in an unused local.
+    (void)SaveTGA(filename, width, height, data);
+    delete []data;
 }
 
-void GetTextureBGRA(u8 *dst, u32 texmap, int width, int height)
+void GetTextureBGRA(u8 *dst, u32 texmap, s32 mip, int width, int height)
 {
     u8 sample[4];    
 
     for (int y = 0; y < height; y++)
         for (int x = 0; x < width; x++) {
-            TextureSampler::Sample((float)x, (float)y, 0, texmap, sample);
+            TextureSampler::SampleMip(x << 7, y << 7, mip, false, texmap, sample);
             // rgba to bgra
             *(dst++) = sample[2];
             *(dst++) = sample[1];
@@ -87,13 +83,32 @@ void GetTextureBGRA(u8 *dst, u32 texmap, int width, int height)
         }
 }
 
+// Returns the texture's max_lod (4 fractional bits) rounded up to a whole mip level.
+s32 GetMaxTextureLod(u32 texmap)
+{
+    // bit 2 of texmap picks the bank of four units, the low two bits the unit within it
+    const u8 maxLod = bpmem.tex[(texmap >> 2) & 1].texMode1[texmap & 3].max_lod;
+
+    const u8 wholeLevels = maxLod >> 4;
+    const bool hasFraction = (maxLod & 0xf) != 0;
+
+    // any non-zero fraction rounds the level up
+    if (hasFraction)
+        return (s32)(u8)(wholeLevels + 1);
+    return (s32)wholeLevels;
+}
+
 void DumpActiveTextures()
 {
     for (unsigned int stageNum = 0; stageNum < bpmem.genMode.numindstages; stageNum++)
     {
         u32 texmap = bpmem.tevindref.getTexMap(stageNum);
 
-        SaveTexture(StringFromFormat("%star%i_ind%i_map%i.tga", File::GetUserPath(D_DUMPTEXTURES_IDX), stats.thisFrame.numDrawnObjects, stageNum, texmap).c_str(), texmap);     
+        // maxLod is the highest mip level, so the range is inclusive: with
+        // "mip < maxLod" a texture whose max_lod is 0 was never dumped at all.
+        s32 maxLod = GetMaxTextureLod(texmap);
+        for (s32 mip = 0; mip <= maxLod; ++mip)
+            SaveTexture(StringFromFormat("%star%i_ind%i_map%i_mip%i.tga", File::GetUserPath(D_DUMPTEXTURES_IDX), stats.thisFrame.numDrawnObjects, stageNum, texmap, mip).c_str(), texmap, mip);
     }
 
     for (unsigned int stageNum = 0; stageNum <= bpmem.genMode.numtevstages; stageNum++)
@@ -104,7 +119,11 @@ void DumpActiveTextures()
 
         int texmap = order.getTexMap(stageOdd);
 
-        SaveTexture(StringFromFormat("%star%i_stage%i_map%i.tga", File::GetUserPath(D_DUMPTEXTURES_IDX), stats.thisFrame.numDrawnObjects, stageNum, texmap).c_str(), texmap);           
+        // maxLod is the highest mip level, so the range is inclusive: with
+        // "mip < maxLod" a texture whose max_lod is 0 was never dumped at all.
+        s32 maxLod = GetMaxTextureLod(texmap);
+        for (s32 mip = 0; mip <= maxLod; ++mip)
+            SaveTexture(StringFromFormat("%star%i_stage%i_map%i_mip%i.tga", File::GetUserPath(D_DUMPTEXTURES_IDX), stats.thisFrame.numDrawnObjects, stageNum, texmap, mip).c_str(), texmap, mip);
     }
 }
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.h b/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.h
index c03d291f66..d40a4dc3e7 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/DebugUtil.h
@@ -22,7 +22,7 @@ namespace DebugUtil
 {
     void Init();
 
-    void GetTextureBGRA(u8 *dst, u32 texmap, int width, int height);
+    void GetTextureBGRA(u8 *dst, u32 texmap, s32 mip, int width, int height);
 
     void DumpActiveTextures();
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/HwRasterizer.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/HwRasterizer.cpp
index ab70b0fa96..5975e0967b 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/HwRasterizer.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/HwRasterizer.cpp
@@ -155,7 +155,7 @@ namespace HwRasterizer
         int width = texImage0.width;
         int height = texImage0.height;
 
-        DebugUtil::GetTextureBGRA(temp, 0, width, height);
+        DebugUtil::GetTextureBGRA(temp, 0, 0, width, height);
 
         glGenTextures(1, (GLuint *)&texture);
 		glBindTexture(GL_TEXTURE_RECTANGLE_ARB, texture);
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/NativeVertexFormat.h b/Source/Plugins/Plugin_VideoSoftware/Src/NativeVertexFormat.h
index befc048f37..a4e9af8dad 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/NativeVertexFormat.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/NativeVertexFormat.h
@@ -18,6 +18,8 @@
 #ifndef _NATIVEVERTEXFORMAT_H
 #define _NATIVEVERTEXFORMAT_H
 
+#include "../../Plugin_VideoDX9/Src/Vec3.h"
+
 #ifdef WIN32
 #define LOADERDECL __cdecl
 #else
@@ -26,25 +28,33 @@
 
 typedef void (LOADERDECL *TPipelineFunction)();
 
+struct Vec4 // four-component float vector; used below for OutputVertexData::projectedPosition
+{
+	float x;
+	float y;
+	float z;
+	float w; // read as the divisor in Clipper::PerspectiveDivide
+};
+
 struct InputVertexData
 {
     u8 posMtx;
     u8 texMtx[8];
 
-    float position[4];    
-    float normal[3][3];
+    Vec3 position;    // NOTE(review): was float[4]; confirm nothing still writes a fourth component
+    Vec3 normal[3];   // three Vec3 entries (was float[3][3])
     u8 color[2][4];
     float texCoords[8][2];
 };
 
 struct OutputVertexData
 {
-    float mvPosition[3];
-    float projectedPosition[4];
-    float screenPosition[3];
-    float normal[3][3];
+    Vec3 mvPosition;
+    Vec4 projectedPosition;
+    Vec3 screenPosition;
+    Vec3 normal[3];
     u8 color[2][4];
-    float texCoords[8][3];
+    Vec3 texCoords[8];
 
     void Lerp(float t, OutputVertexData *a, OutputVertexData *b)
     {
@@ -52,17 +62,16 @@ struct OutputVertexData
 
         #define LINTERP_INT(T, OUT, IN) (OUT) + (((IN - OUT) * T) >> 8)
 
-        for (int i = 0; i < 3; ++i)
-            mvPosition[i] = LINTERP(t, a->mvPosition[i], b->mvPosition[i]);
+        mvPosition = LINTERP(t, a->mvPosition, b->mvPosition);
 
-        for (int i = 0; i < 4; ++i)
-            projectedPosition[i] = LINTERP(t, a->projectedPosition[i], b->projectedPosition[i]);
+        projectedPosition.x = LINTERP(t, a->projectedPosition.x, b->projectedPosition.x);
+		projectedPosition.y = LINTERP(t, a->projectedPosition.y, b->projectedPosition.y);
+		projectedPosition.z = LINTERP(t, a->projectedPosition.z, b->projectedPosition.z);
+		projectedPosition.w = LINTERP(t, a->projectedPosition.w, b->projectedPosition.w);
 
         for (int i = 0; i < 3; ++i)
         {
-            normal[i][0] = LINTERP(t, a->normal[i][0], b->normal[i][0]);
-            normal[i][1] = LINTERP(t, a->normal[i][1], b->normal[i][1]);
-            normal[i][2] = LINTERP(t, a->normal[i][2], b->normal[i][2]);
+            normal[i] = LINTERP(t, a->normal[i], b->normal[i]);
         }
 
         u16 t_int = (u16)(t * 256);
@@ -74,9 +83,7 @@ struct OutputVertexData
 
         for (int i = 0; i < 8; ++i)
         {
-            texCoords[i][0] = LINTERP(t, a->texCoords[i][0], b->texCoords[i][0]);
-            texCoords[i][1] = LINTERP(t, a->texCoords[i][1], b->texCoords[i][1]);
-            texCoords[i][2] = LINTERP(t, a->texCoords[i][2], b->texCoords[i][2]);
+            texCoords[i] = LINTERP(t, a->texCoords[i], b->texCoords[i]);
         }
 
         #undef LINTERP
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp
index cf85d494ac..22ecb1c6d5 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.cpp
@@ -27,8 +27,20 @@
 #include "VideoConfig.h"
 
 
-#define BLOCK_SIZE 8
+#define BLOCK_SIZE 2
 
+#define CLAMP(x, a, b) (x>b)?b:(x<a)?a:x
+
+// returns approximation of log2(f) in s28.4; results are close enough to use for LOD
+static inline s32 FixedLog2(float f)
+{
+	// Read the float's bits through a union (supported by GCC, Clang and MSVC); dereferencing
+	// a float through a u32* breaks strict aliasing and may be miscompiled when optimizing.
+	union { float value; u32 bits; } pun = { f };
+	s32 logInt = ((pun.bits & 0x7F800000) >> 19) - 2032; // exponent field shifted into s28.4, minus the bias (127 << 4)
+	s32 logFract = (pun.bits & 0x007fffff) >> 19; // top 4 mantissa bits as the approximate fraction
+	return logInt + logFract;
+}
 
 namespace Rasterizer
 {
@@ -43,6 +55,7 @@ s32 scissorRight = 0;
 s32 scissorBottom = 0;
 
 Tev tev;
+RasterBlock rasterBlock;
 
 void Init()
 {
@@ -91,53 +104,58 @@ void SetTevReg(int reg, int comp, bool konst, s16 color)
     tev.SetRegColor(reg, comp, konst, color);
 }
 
-inline void Draw(s32 x, s32 y)
+inline void Draw(s32 x, s32 y, s32 xi, s32 yi)
 {
     INCSTAT(stats.thisFrame.rasterizedPixels);
 
-    float zFloat = 1.0f + ZSlope.GetValue(x, y);
-    if(zFloat < 0|| zFloat > 1)
-        return;
+	float zFloat = 1.0f + ZSlope.GetValue(x, y);
+	if (zFloat < 0.0f || zFloat > 1.0f)
+		return;
 
-    u32 z = (u32)(zFloat * 0x00ffffff);
+	s32 z = (s32)(zFloat * 0x00ffffff);
 
-    if (bpmem.zcontrol.zcomploc && bpmem.zmode.testenable)
-    {
-        // early z
-        if (!EfbInterface::ZCompare(x, y, z))
-            return;
-    }
+	if (bpmem.zcontrol.zcomploc && bpmem.zmode.testenable)
+	{
+		// early z
+		if (!EfbInterface::ZCompare(x, y, z))
+			return;
+	}
 
-    float invW = 1.0f / WSlope.GetValue(x, y);
+	RasterBlockPixel& pixel = rasterBlock.Pixel[xi][yi];
 
-    tev.Position[0] = x;
-    tev.Position[1] = y;
-    tev.Position[2] = z;
+	// 1/w is already folded into pixel.Uv by BuildBlock, so no per-pixel invW local is needed here.
 
-    for(unsigned int i = 0; i < bpmem.genMode.numcolchans; i++)
-    {
-        for(int comp = 0; comp < 4; comp++)
-            tev.Color[i][comp] = (u8)ColorSlopes[i][comp].GetValue(x, y);
-    }
+	tev.Position[0] = x;
+	tev.Position[1] = y;
+	tev.Position[2] = z;
 
-    for(unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
-    {
-        if (xfregs.texMtxInfo[i].projection)
-        {
-            float q = TexSlopes[i][2].GetValue(x, y) * invW;
-            float invQ = invW / q;
-            tev.Uv[i][0] = TexSlopes[i][0].GetValue(x, y) * invQ * (bpmem.texcoords[i].s.scale_minus_1 + 1);
-            tev.Uv[i][1] = TexSlopes[i][1].GetValue(x, y) * invQ * (bpmem.texcoords[i].t.scale_minus_1 + 1);
-            tev.Lod[i] = 0;
-        }
-        else
-        {
-            tev.Uv[i][0] = TexSlopes[i][0].GetValue(x, y) * invW * (bpmem.texcoords[i].s.scale_minus_1 + 1);
-            tev.Uv[i][1] = TexSlopes[i][1].GetValue(x, y) * invW * (bpmem.texcoords[i].t.scale_minus_1 + 1);
-            tev.Lod[i] = 0;
-        }
-    }
+	//  colors
+	for (unsigned int i = 0; i < bpmem.genMode.numcolchans; i++)
+	{
+		for(int comp = 0; comp < 4; comp++)
+			tev.Color[i][comp] = (u8)ColorSlopes[i][comp].GetValue(x, y);
+	}
 
+	// tex coords
+	for (unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
+	{
+		// multiply by 128 because TEV stores UVs as s17.7
+		tev.Uv[i].s = (s32)(pixel.Uv[i][0] * 128);
+		tev.Uv[i].t = (s32)(pixel.Uv[i][1] * 128);
+	}
+
+	for (unsigned int i = 0; i < bpmem.genMode.numindstages; i++)
+	{
+		tev.IndirectLod[i] = rasterBlock.IndirectLod[i];
+		tev.IndirectLinear[i] = rasterBlock.IndirectLinear[i];
+	}
+
+	for (unsigned int i = 0; i <= bpmem.genMode.numtevstages; i++)
+	{
+		tev.TextureLod[i] = rasterBlock.TextureLod[i];
+		tev.TextureLinear[i] = rasterBlock.TextureLinear[i];
+	}
+   
     tev.Draw();
 }
 
@@ -155,6 +173,109 @@ void InitSlope(Slope *slope, float f1, float f2, float f3, float DX31, float DX1
     slope->y0 = Y1;
 }
 
+// Computes the mip LOD (s28.4) and the filter choice for one texture lookup in the
+// current raster block, from the UV deltas between neighbouring block pixels.
+inline void CalculateLOD(s32 &lod, bool &linear, u32 texmap, u32 texcoord)
+{
+	FourTexUnits& texUnit = bpmem.tex[(texmap >> 2) & 1];
+	u8 subTexmap = texmap & 3;
+
+	// LOD calculation requires data from the texture mode for bias, etc.
+	// it does not seem to use the actual texture size
+	TexMode0& tm0 = texUnit.texMode0[subTexmap];
+	TexMode1& tm1 = texUnit.texMode1[subTexmap];
+
+	// diag_lod compares against the diagonal neighbour only; otherwise the
+	// [1][0] and [0][1] neighbours are both considered.
+	const float *uv0 = rasterBlock.Pixel[0][0].Uv[texcoord];
+	const float *uvA = tm0.diag_lod ? rasterBlock.Pixel[1][1].Uv[texcoord] : rasterBlock.Pixel[1][0].Uv[texcoord];
+	const float *uvB = tm0.diag_lod ? uvA : rasterBlock.Pixel[0][1].Uv[texcoord];
+
+	// Largest |delta| over s and t. Magnitudes are taken in float arithmetic:
+	// unqualified abs() can bind to the int overload and truncate the deltas.
+	float maxDelta = 0.0f;
+	for (int c = 0; c < 2; ++c)
+	{
+		float dA = uv0[c] - uvA[c];
+		float dB = uv0[c] - uvB[c];
+		if (dA < 0) dA = -dA;
+		if (dB < 0) dB = -dB;
+		if (dA > maxDelta) maxDelta = dA;
+		if (dB > maxDelta) maxDelta = dB;
+	}
+
+	// get LOD in s28.4
+	lod = FixedLog2(maxDelta);
+
+	// bias is s2.5
+	int bias = tm0.lod_bias;
+	bias >>= 1;
+	lod += bias;
+
+	linear = ((lod >= 0 && (tm0.min_filter & 4)) || (lod < 0 && tm0.mag_filter));
+
+	// order of checks matters:
+	// if lod > max then max, else if lod < min then min
+	lod = CLAMP(lod, (s32)tm1.min_lod, (s32)tm1.max_lod);
+}
+
+// Fills rasterBlock for the BLOCK_SIZE x BLOCK_SIZE pixels whose corner is (blockX, blockY):
+// per-pixel 1/w and texture coordinates, then one LOD/filter choice per indirect and TEV stage.
+void BuildBlock(s32 blockX, s32 blockY)
+{
+	for (s32 yi = 0; yi < BLOCK_SIZE; yi++)
+	{
+		for (s32 xi = 0; xi < BLOCK_SIZE; xi++)
+		{
+			RasterBlockPixel& pixel = rasterBlock.Pixel[xi][yi];
+			s32 x = xi + blockX;
+			s32 y = yi + blockY;
+			float invW = 1.0f / WSlope.GetValue(x, y);
+			pixel.InvW = invW;
+
+			// tex coords
+			for (unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
+			{
+				float projection;
+				if (xfregs.texMtxInfo[i].projection)
+				{
+					float q = TexSlopes[i][2].GetValue(x, y) * invW;
+					projection = invW / q;
+				}
+				else
+					projection = invW;
+				pixel.Uv[i][0] = TexSlopes[i][0].GetValue(x, y) * projection;
+				pixel.Uv[i][1] = TexSlopes[i][1].GetValue(x, y) * projection;
+			}
+		}
+	}
+
+	// Each indirect stage's texmap and texcoord fields are shifted out 3 bits at a time, so mask with 7:
+	// masking with 3 dropped the top bit, which CalculateLOD uses to select the texture bank.
+	u32 indref = bpmem.tevindref.hex;
+	for (unsigned int i = 0; i < bpmem.genMode.numindstages; i++)
+	{
+		u32 texmap = indref & 7;
+		indref >>= 3;
+		u32 texcoord = indref & 7;
+		indref >>= 3;
+
+		CalculateLOD(rasterBlock.IndirectLod[i], rasterBlock.IndirectLinear[i], texmap, texcoord);
+	}
+
+	for (unsigned int i = 0; i <= bpmem.genMode.numtevstages; i++)
+	{
+		int stageOdd = i&1;
+		TwoTevStageOrders &order = bpmem.tevorders[i >> 1];
+		if(order.getEnable(stageOdd))
+		{
+			u32 texmap = order.getTexMap(stageOdd);
+			u32 texcoord = order.getTexCoord(stageOdd);
+			CalculateLOD(rasterBlock.TextureLod[i], rasterBlock.TextureLinear[i], texmap, texcoord);
+		}
+	}
+}
+
 void DrawTriangleFrontFace(OutputVertexData *v0, OutputVertexData *v1, OutputVertexData *v2)
 {
     INCSTAT(stats.thisFrame.numTrianglesDrawn);
@@ -217,7 +338,7 @@ void DrawTriangleFrontFace(OutputVertexData *v0, OutputVertexData *v1, OutputVer
     float fltdy12 = flty1 - v1->screenPosition[1];
     float fltdy31 = v2->screenPosition[1] - flty1;
 
-    float w[3] = { 1.0f / v0->projectedPosition[3], 1.0f / v1->projectedPosition[3], 1.0f / v2->projectedPosition[3] };
+    float w[3] = { 1.0f / v0->projectedPosition.w, 1.0f / v1->projectedPosition.w, 1.0f / v2->projectedPosition.w };
     InitSlope(&WSlope, w[0], w[1], w[2], fltdx31, fltdx12, fltdy12, fltdy31, fltx1, flty1);
 
     InitSlope(&ZSlope, v0->screenPosition[2], v1->screenPosition[2], v2->screenPosition[2], fltdx31, fltdx12, fltdy12, fltdy31, fltx1, flty1);
@@ -281,14 +402,16 @@ void DrawTriangleFrontFace(OutputVertexData *v0, OutputVertexData *v1, OutputVer
             // Skip block when outside an edge
             if(a == 0x0 || b == 0x0 || c == 0x0) continue;
 
+			BuildBlock(x, y);
+
             // Accept whole block when totally covered
             if(a == 0xF && b == 0xF && c == 0xF)
             {
                 for(s32 iy = 0; iy < BLOCK_SIZE; iy++)
                 {
-                    for(s32 ix = x; ix < x + BLOCK_SIZE; ix++)
+                    for(s32 ix = 0; ix < BLOCK_SIZE; ix++)
                     {                        
-                        Draw(ix, iy + y);
+                        Draw(x + ix, y + iy, ix, iy);
                     }
                 }
             }
@@ -298,17 +421,17 @@ void DrawTriangleFrontFace(OutputVertexData *v0, OutputVertexData *v1, OutputVer
                 s32 CY2 = C2 + DX23 * y0 - DY23 * x0;
                 s32 CY3 = C3 + DX31 * y0 - DY31 * x0;
 
-                for(s32 iy = y; iy < y + BLOCK_SIZE; iy++)
+                for(s32 iy = 0; iy < BLOCK_SIZE; iy++)
                 {
                     s32 CX1 = CY1;
                     s32 CX2 = CY2;
                     s32 CX3 = CY3;
 
-                    for(s32 ix = x; ix < x + BLOCK_SIZE; ix++)
+                    for(s32 ix = 0; ix < BLOCK_SIZE; ix++)
                     {
                         if(CX1 > 0 && CX2 > 0 && CX3 > 0)
                         {
-                            Draw(ix, iy);
+                            Draw(x + ix, y + iy, ix, iy);
                         }
 
                         CX1 -= FDY12;
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.h b/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.h
index 33c152703e..403b0459ba 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Rasterizer.h
@@ -39,6 +39,21 @@ namespace Rasterizer
         float y0;
         float GetValue(s32 x, s32 y) { return f0 + (dfdx * (x - x0)) + (dfdy * (y - y0)); }
     };
+	// Values cached for one pixel of the current raster block.
+	struct RasterBlockPixel
+	{
+		float InvW;	// 1 / interpolated w, written by BuildBlock
+		float Uv[8][2];	// per texgen: [0] = s, [1] = t, already multiplied by the projection factor
+	};
+	// State shared by the pixels of one raster block; filled by BuildBlock and read by Draw in Rasterizer.cpp.
+	struct RasterBlock
+	{
+		RasterBlockPixel Pixel[2][2];	// indexed [xi][yi]; NOTE(review): extent must match BLOCK_SIZE in Rasterizer.cpp
+		s32 IndirectLod[4];	// LOD per indirect stage
+		bool IndirectLinear[4];	// filter choice per indirect stage
+		s32 TextureLod[16];	// LOD per TEV stage
+		bool TextureLinear[16];	// filter choice per TEV stage
+	};
     
 }
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/SetupUnit.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/SetupUnit.cpp
index de28989972..6bc92dc071 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/SetupUnit.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/SetupUnit.cpp
@@ -134,10 +134,38 @@ void SetupUnit::SetupTriFan()
 }
 
 void SetupUnit::SetupLine()
-{}
+{
+	// Lines are independent vertex pairs: buffer the first vertex, draw when the second arrives.
+	if (m_VertexCounter >= 1)
+	{
+		Clipper::ProcessLine(m_VertPointer[0], m_VertPointer[1]);
+		m_VertexCounter = 0;
+	}
+	else
+	{
+		m_VertexCounter++;
+	}
+	m_VertWritePointer = m_VertPointer[m_VertexCounter];
+}
 
 void SetupUnit::SetupLineStrip()
-{}
+{	// Line strip: the first vertex is only buffered; each later vertex draws one segment.
+	if (m_VertexCounter < 1)
+    {
+        m_VertexCounter++;
+		m_VertWritePointer = m_VertPointer[m_VertexCounter];
+        return;
+    }
+	// Two vertices are available from here on.
+	m_VertexCounter++;
+	// Draw the segment between the two currently tracked vertices.
+    Clipper::ProcessLine(m_VertPointer[0], m_VertPointer[1]);
+	// The next incoming vertex overwrites the slot that held the segment's first endpoint...
+	m_VertWritePointer = m_VertPointer[0];
+	// ...and the segment's second endpoint becomes the first endpoint of the next segment.
+	m_VertPointer[0] = m_VertPointer[1];
+	m_VertPointer[1] = &m_Vertices[m_VertexCounter & 1]; // NOTE(review): assumes the counter parity selects the slot just given to m_VertWritePointer - confirm
+}
 
 void SetupUnit::SetupPoint()
 {}
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp
index 83c095c972..680806e85a 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Tev.cpp
@@ -439,34 +439,33 @@ static bool AlphaTest(int alpha)
     return true;
 }
 
-inline float WrapIndirectCoord(float coord, int wrapMode)
+inline s32 WrapIndirectCoord(s32 coord, int wrapMode)
 {
     switch (wrapMode) {
         case ITW_OFF:
             return coord;
         case ITW_256:
-            return fmod(coord, 256);
-         case ITW_128:
-            return fmod(coord, 128);
+            return (coord % (256 << 7)); // period is written as 256 << 7 because coord is now an integer; NOTE(review): confirm it carries 7 fractional bits
+        case ITW_128:
+            return (coord % (128 << 7)); // same scheme with a period of 128
         case ITW_64:
-            return fmod(coord, 64);
+            return (coord % (64 << 7)); // period of 64
         case ITW_32:
-            return fmod(coord, 32);
+            return (coord % (32 << 7)); // period of 32
         case ITW_16:
-            return fmod(coord, 16);
+            return (coord % (16 << 7)); // period of 16; NOTE(review): for negative coord the result of % is not wrapped into [0, period) - confirm desired
         case ITW_0:
             return 0;
     }
     return 0;
 }
 
-void Tev::Indirect(unsigned int stageNum, float s, float t)
+void Tev::Indirect(unsigned int stageNum, s32 s, s32 t)
 {
     TevStageIndirect &indirect = bpmem.tevind[stageNum];
     u8 *indmap = IndirectTex[indirect.bt];
-    
 
-    float indcoord[3];
+    s32 indcoord[3];
 
     // alpha bump select
     switch (indirect.bs) {
@@ -494,32 +493,32 @@ void Tev::Indirect(unsigned int stageNum, float s, float t)
     // format
     switch(indirect.fmt) {
         case ITF_8:
-            indcoord[0] = (float)indmap[ALP_C] + bias[0];
-            indcoord[1] = (float)indmap[BLU_C] + bias[1];
-            indcoord[2] = (float)indmap[GRN_C] + bias[2];
+            indcoord[0] = indmap[ALP_C] + bias[0];
+            indcoord[1] = indmap[BLU_C] + bias[1];
+            indcoord[2] = indmap[GRN_C] + bias[2];
             AlphaBump = AlphaBump & 0xf8;
             break;
         case ITF_5:
-            indcoord[0] = (float)(indmap[ALP_C] & 0x1f) + bias[0];
-            indcoord[1] = (float)(indmap[BLU_C] & 0x1f) + bias[1];
-            indcoord[2] = (float)(indmap[GRN_C] & 0x1f) + bias[2];
+            indcoord[0] = (indmap[ALP_C] & 0x1f) + bias[0];
+            indcoord[1] = (indmap[BLU_C] & 0x1f) + bias[1];
+            indcoord[2] = (indmap[GRN_C] & 0x1f) + bias[2];
             AlphaBump = AlphaBump & 0xe0;
             break;
         case ITF_4:
-            indcoord[0] = (float)(indmap[ALP_C] & 0x0f) + bias[0];
-            indcoord[1] = (float)(indmap[BLU_C] & 0x0f) + bias[1];
-            indcoord[2] = (float)(indmap[GRN_C] & 0x0f) + bias[2];
+            indcoord[0] = (indmap[ALP_C] & 0x0f) + bias[0];
+            indcoord[1] = (indmap[BLU_C] & 0x0f) + bias[1];
+            indcoord[2] = (indmap[GRN_C] & 0x0f) + bias[2];
             AlphaBump = AlphaBump & 0xf0;
             break;
         case ITF_3:
-            indcoord[0] = (float)(indmap[ALP_C] & 0x07) + bias[0];
-            indcoord[1] = (float)(indmap[BLU_C] & 0x07) + bias[1];
-            indcoord[2] = (float)(indmap[GRN_C] & 0x07) + bias[2];
+            indcoord[0] = (indmap[ALP_C] & 0x07) + bias[0];
+            indcoord[1] = (indmap[BLU_C] & 0x07) + bias[1];
+            indcoord[2] = (indmap[GRN_C] & 0x07) + bias[2];
             AlphaBump = AlphaBump & 0xf8;
             break;
     }
 
-    float indtevtrans[2] = { 0,0 };
+    s64 indtevtrans[2] = { 0,0 };
 
     // matrix multiply
     int indmtxid = indirect.mid & 3;
@@ -529,39 +528,40 @@ void Tev::Indirect(unsigned int stageNum, float s, float t)
         int scale = ((u32)indmtx.col0.s0 << 0) |
 	                ((u32)indmtx.col1.s1 << 2) |
 	                ((u32)indmtx.col2.s2 << 4);
-        float fscale = 0.0f;
+
+		int shift = 0; // default for the unhandled (indirect.mid & 12) == 12 case; avoids shifting by an indeterminate value
 
         switch (indirect.mid & 12) {
-            case 0:
-                fscale = powf(2.0f, (float)(scale - 17)) / 1024.0f;
+            case 0:   
+				shift = 3 + (17 - scale);
                 indtevtrans[0] = indmtx.col0.ma * indcoord[0] + indmtx.col1.mc * indcoord[1] + indmtx.col2.me * indcoord[2];
                 indtevtrans[1] = indmtx.col0.mb * indcoord[0] + indmtx.col1.md * indcoord[1] + indmtx.col2.mf * indcoord[2];
                 break;
             case 4: // s matrix
-                fscale = powf(2.0f, (float)(scale - 17)) / 256;
+				shift = 8 + (17 - scale);
                 indtevtrans[0] = s * indcoord[0];
                 indtevtrans[1] = t * indcoord[0];
                 break;
             case 8: // t matrix
-                fscale = powf(2.0f, (float)(scale - 17)) / 256;
+				shift = 8 + (17 - scale);
                 indtevtrans[0] = s * indcoord[1];
                 indtevtrans[1] = t * indcoord[1];
                 break;
         }
 
-        indtevtrans[0] *= fscale;
-        indtevtrans[1] *= fscale;
+		indtevtrans[0] = shift >= 0 ? indtevtrans[0] >> shift : indtevtrans[0] << -shift;
+		indtevtrans[1] = shift >= 0 ? indtevtrans[1] >> shift : indtevtrans[1] << -shift;
     }
 
-    if (indirect.fb_addprev)
+	if (indirect.fb_addprev)
     {
-        TexCoord[0] += WrapIndirectCoord(s, indirect.sw) + indtevtrans[0];
-        TexCoord[1] += WrapIndirectCoord(t, indirect.tw) + indtevtrans[1];
+        TexCoord.s += (int)(WrapIndirectCoord(s, indirect.sw) + indtevtrans[0]);
+        TexCoord.t += (int)(WrapIndirectCoord(t, indirect.tw) + indtevtrans[1]);
     }
     else
     {
-        TexCoord[0] = WrapIndirectCoord(s, indirect.sw) + indtevtrans[0];
-        TexCoord[1] = WrapIndirectCoord(t, indirect.tw) + indtevtrans[1];
+        TexCoord.s = (int)(WrapIndirectCoord(s, indirect.sw) + indtevtrans[0]);
+        TexCoord.t = (int)(WrapIndirectCoord(t, indirect.tw) + indtevtrans[1]);
     }
 }
 
@@ -580,10 +580,12 @@ void Tev::Draw()
         u32 texcoordSel = bpmem.tevindref.getTexCoord(stageNum);
         u32 texmap = bpmem.tevindref.getTexMap(stageNum);
 
-        float scaleS = bpmem.texscale[stageNum2].getScaleS(stageOdd);
-        float scaleT = bpmem.texscale[stageNum2].getScaleT(stageOdd);
+		const TEXSCALE& texscale = bpmem.texscale[stageNum2];
+		s32 scaleS = stageOdd ? texscale.ss1:texscale.ss0;
+        s32 scaleT = stageOdd ? texscale.ts1:texscale.ts0;
 
-        TextureSampler::Sample(Uv[texcoordSel][0] * scaleS, Uv[texcoordSel][1] * scaleT, Lod[texcoordSel], texmap, IndirectTex[stageNum]);
+        TextureSampler::Sample(Uv[texcoordSel].s >> scaleS, Uv[texcoordSel].t >> scaleT,
+			IndirectLod[stageNum], IndirectLinear[stageNum], texmap, IndirectTex[stageNum]);
 
 #ifdef _DEBUG
         if (g_Config.bDumpTevStages)
@@ -608,14 +610,14 @@ void Tev::Draw()
         int texcoordSel = order.getTexCoord(stageOdd);
         int texmap = order.getTexMap(stageOdd);
 
-        Indirect(stageNum, Uv[texcoordSel][0], Uv[texcoordSel][1]);
+        Indirect(stageNum, Uv[texcoordSel].s, Uv[texcoordSel].t);
 
         // sample texture
         if (order.getEnable(stageOdd))
         {
             u8 texel[4];
     
-            TextureSampler::Sample(TexCoord[0], TexCoord[1], Lod[texcoordSel], texmap, texel);
+			TextureSampler::Sample(TexCoord.s, TexCoord.t, TextureLod[stageNum], TextureLinear[stageNum], texmap, texel);
 
             int swaptable = ac.tswap * 2;            
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/Tev.h b/Source/Plugins/Plugin_VideoSoftware/Src/Tev.h
index 0419df8086..caaa88cfc9 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/Tev.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/Tev.h
@@ -21,7 +21,20 @@
 #include "BPMemLoader.h"
 
 class Tev
-{
+{ 
+	struct InputRegType {
+        unsigned a : 8;
+        unsigned b : 8;
+        unsigned c : 8;
+        signed   d : 11;
+    };
+
+	struct TextureCoordinateType
+	{
+		signed s : 24;
+		signed t : 24;
+	};
+
     // color order: RGBA
     s16 Reg[4][4];    
     s16 KonstantColors[4][4];
@@ -32,7 +45,7 @@ class Tev
     s16 Zero16[4];
     u8 AlphaBump;
     u8 IndirectTex[4][4];
-    float TexCoord[2];
+	TextureCoordinateType TexCoord;
 
     s16 *m_ColorInputLUT[16][3];
     s16 *m_AlphaInputLUT[8];        // values must point to RGBA color
@@ -49,20 +62,16 @@ class Tev
     void DrawAlphaRegular(TevStageCombiner::AlphaCombiner &ac);
     void DrawAlphaCompare(TevStageCombiner::AlphaCombiner &ac);
 
-    void Indirect(unsigned int stageNum, float s, float t);    
-
-    struct InputRegType {
-        unsigned a : 8;
-        unsigned b : 8;
-        unsigned c : 8;
-        signed   d : 11;
-    };
+    void Indirect(unsigned int stageNum, s32 s, s32 t);
 
 public:
-    s32 Position[3];
+	s32 Position[3];
     u8 Color[2][4];
-    float Uv[8][2];
-    float Lod[8];
+    TextureCoordinateType Uv[8];
+    s32 IndirectLod[4];
+	bool IndirectLinear[4];
+	s32 TextureLod[16];
+	bool TextureLinear[16];
 
     void Init();
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.cpp
index e7001a537b..44878e262b 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.cpp
@@ -23,29 +23,11 @@
 
 #include <cmath>
 
+#define ALLOW_MIPMAP 1
+
 namespace TextureSampler
 {
 
-inline int iround(float x)
-{
-    int t;
-
-#if defined(_WIN32) && !defined(_M_X64)
-    __asm
-    {
-        fld  x
-        fistp t
-    }
-#else
-	t = (int)x;
-	if((x - t) >= 0.5)
-		return t + 1;
-#endif
-
-    return t;
-}
-
-
 inline void WrapCoord(int &coord, int wrapMode, int imageSize)
 {
     switch (wrapMode)
@@ -85,9 +67,53 @@ inline void AddTexel(u8 *inTexel, u32 *outTexel, u32 fract)
     outTexel[3] += inTexel[3] * fract;
 }
 
-void Sample(float s, float t, float lod, u8 texmap, u8 *sample)
+void Sample(s32 s, s32 t, s32 lod, bool linear, u8 texmap, u8 *sample)
 {
-    FourTexUnits& texUnit = bpmem.tex[(texmap >> 2) & 1];
+    int baseMip = 0;
+	bool mipLinear = false;
+
+#if (ALLOW_MIPMAP)
+	FourTexUnits& texUnit = bpmem.tex[(texmap >> 2) & 1];
+    TexMode0& tm0 = texUnit.texMode0[texmap & 3];
+
+	s32 lodFract = lod & 0xf;
+
+	if (lod > 0 && tm0.min_filter & 3)
+	{
+		// use mipmap
+		baseMip = lod >> 4;
+		mipLinear = (lodFract && tm0.min_filter & 2);
+
+		// if using nearest mip filter and lodFract >= 0.5 round up to next mip
+		baseMip += (lodFract >> 3) & (tm0.min_filter & 1);
+	}
+
+	if (mipLinear)
+	{
+		u8 sampledTex[4];
+        u32 texel[4];
+
+		SampleMip(s, t, baseMip, linear, texmap, sampledTex);
+		SetTexel(sampledTex, texel, (16 - lodFract));
+
+		SampleMip(s, t, baseMip + 1, linear, texmap, sampledTex);
+		AddTexel(sampledTex, texel, lodFract);
+
+		sample[0] = (u8)(texel[0] >> 4);
+        sample[1] = (u8)(texel[1] >> 4);
+        sample[2] = (u8)(texel[2] >> 4);
+        sample[3] = (u8)(texel[3] >> 4);
+	}
+	else
+#endif
+	{
+		SampleMip(s, t, baseMip, linear, texmap, sample);
+	}	
+}
+
+void SampleMip(s32 s, s32 t, s32 mip, bool linear, u8 texmap, u8 *sample)
+{
+	FourTexUnits& texUnit = bpmem.tex[(texmap >> 2) & 1];
     u8 subTexmap = texmap & 3;
 
     TexMode0& tm0 = texUnit.texMode0[subTexmap];
@@ -97,59 +123,85 @@ void Sample(float s, float t, float lod, u8 texmap, u8 *sample)
     u32 imageBase = texUnit.texImage3[subTexmap].image_base << 5;    
     u8 *imageSrc = g_VideoInitialize.pGetMemoryPointer(imageBase);
 
-    bool linear = false;
-    if ((lod > 0 && tm0.min_filter > 4) || (lod <= 0 && tm0.mag_filter))
-        linear = true;
+	int imageWidth = ti0.width;
+	int imageHeight = ti0.height;
+
+	int tlutAddress = texTlut.tmem_offset << 9;
+	
+	// reduce sample location and texture size to mip level
+	// move texture pointer to mip location
+	if (mip)
+	{
+		int mipWidth = imageWidth + 1;
+		int mipHeight = imageHeight + 1;
+
+		int fmtWidth = TexDecoder_GetBlockWidthInTexels(ti0.format);
+		int fmtHeight = TexDecoder_GetBlockHeightInTexels(ti0.format);
+		int fmtDepth = TexDecoder_GetTexelSizeInNibbles(ti0.format);
+
+		imageWidth >>= mip;
+		imageHeight >>= mip;
+		s >>= mip;
+		t >>= mip;
+
+		while (mip)
+		{
+			mipWidth = max(mipWidth, fmtWidth);
+			mipHeight = max(mipHeight, fmtHeight);
+			u32 size = (mipWidth * mipHeight * fmtDepth) >> 1;
+
+			imageSrc += size;
+			mipWidth >>= 1;
+			mipHeight >>= 1;
+			mip--;
+		}
+	}
+
+	// integer part of sample location
+	int imageS = s >> 7;
+	int imageT = t >> 7;
 
     if (linear)
     {
-        s32 s256 = s32((s - 0.5f) * 256);
-        s32 t256 = s32((t- 0.5f) * 256);
-
-        int imageS = s256 >> 8;
-        int imageSPlus1 = imageS + 1;
-        u32 fractS = s256 & 0xff;
-        fractS += fractS >> 7;
-
-        int imageT = t256 >> 8;
+        // linear sampling
+		int imageSPlus1 = imageS + 1;
+        int fractS = s & 0x7f;
+        
         int imageTPlus1 = imageT + 1;
-        u32 fractT = t256 & 0xff;
-        fractT += fractT >> 7;
+        int fractT = t & 0x7f;
 
         u8 sampledTex[4];
         u32 texel[4];
 
-        WrapCoord(imageS, tm0.wrap_s, ti0.width);
-        WrapCoord(imageT, tm0.wrap_t, ti0.height);
-        WrapCoord(imageSPlus1, tm0.wrap_s, ti0.width);
-        WrapCoord(imageTPlus1, tm0.wrap_t, ti0.height);
+        WrapCoord(imageS, tm0.wrap_s, imageWidth);
+        WrapCoord(imageT, tm0.wrap_t, imageHeight);
+        WrapCoord(imageSPlus1, tm0.wrap_s, imageWidth);
+        WrapCoord(imageTPlus1, tm0.wrap_t, imageHeight);
 
-        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageS, imageT, ti0.width, ti0.format, texTlut.tmem_offset << 9, texTlut.tlut_format);
-        SetTexel(sampledTex, texel, (256 - fractS) * (256 - fractT));
+        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageS, imageT, imageWidth, ti0.format, tlutAddress, texTlut.tlut_format);
+        SetTexel(sampledTex, texel, (128 - fractS) * (128 - fractT));
 
-        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageSPlus1, imageT, ti0.width, ti0.format, texTlut.tmem_offset << 9, texTlut.tlut_format);
-        AddTexel(sampledTex, texel, (fractS) * (256 - fractT));
+        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageSPlus1, imageT, imageWidth, ti0.format, tlutAddress, texTlut.tlut_format);
+        AddTexel(sampledTex, texel, (fractS) * (128 - fractT));
 
-        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageS, imageTPlus1, ti0.width, ti0.format, texTlut.tmem_offset << 9, texTlut.tlut_format);
-        AddTexel(sampledTex, texel, (256 - fractS) * (fractT));
+        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageS, imageTPlus1, imageWidth, ti0.format, tlutAddress, texTlut.tlut_format);
+        AddTexel(sampledTex, texel, (128 - fractS) * (fractT));
 
-        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageSPlus1, imageTPlus1, ti0.width, ti0.format, texTlut.tmem_offset << 9, texTlut.tlut_format);
+        TexDecoder_DecodeTexel(sampledTex, imageSrc, imageSPlus1, imageTPlus1, imageWidth, ti0.format, tlutAddress, texTlut.tlut_format);
         AddTexel(sampledTex, texel, (fractS) * (fractT));
 
-        sample[0] = (u8)(texel[0] >> 16);
-        sample[1] = (u8)(texel[1] >> 16);
-        sample[2] = (u8)(texel[2] >> 16);
-        sample[3] = (u8)(texel[3] >> 16);
+        sample[0] = (u8)(texel[0] >> 14);
+        sample[1] = (u8)(texel[1] >> 14);
+        sample[2] = (u8)(texel[2] >> 14);
+        sample[3] = (u8)(texel[3] >> 14);
     }
     else
     {
-        int imageS = int(s);
-        int imageT = int(t);
+        // nearest neighbor sampling
+		WrapCoord(imageS, tm0.wrap_s, imageWidth);
+        WrapCoord(imageT, tm0.wrap_t, imageHeight);
 
-        WrapCoord(imageS, tm0.wrap_s, ti0.width);
-        WrapCoord(imageT, tm0.wrap_t, ti0.height);
-
-        TexDecoder_DecodeTexel(sample, imageSrc, imageS, imageT, ti0.width, ti0.format, texTlut.tmem_offset << 9, texTlut.tlut_format);   
+        TexDecoder_DecodeTexel(sample, imageSrc, imageS, imageT, imageWidth, ti0.format, tlutAddress, texTlut.tlut_format);   
     }
 }
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.h b/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.h
index 27d786068d..b456769c92 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.h
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/TextureSampler.h
@@ -23,7 +23,9 @@
 
 namespace TextureSampler
 {
-    void Sample(float s, float t, float lod, u8 texmap, u8 *sample);
+	void Sample(s32 s, s32 t, s32 lod, bool linear, u8 texmap, u8 *sample);
+
+	void SampleMip(s32 s, s32 t, s32 mip, bool linear, u8 texmap, u8 *sample);
 }
 
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/TransformUnit.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/TransformUnit.cpp
index b955d233e9..c614af8627 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/TransformUnit.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/TransformUnit.cpp
@@ -22,6 +22,7 @@
 #include "TransformUnit.h"
 #include "XFMemLoader.h"
 #include "CPMemLoader.h"
+#include "BPMemLoader.h"
 #include "NativeVertexFormat.h"
 
 #include "../../Plugin_VideoDX9/Src/Vec3.h"
@@ -30,48 +31,48 @@
 namespace TransformUnit
 {
 
-void MultiplyVec2Mat24(const float *vec, const float *mat, float *result)
+void MultiplyVec2Mat24(const Vec3 &vec, const float *mat, Vec3 &result)
 {
-    result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] + mat[3];
-    result[1] = mat[4] * vec[0] + mat[5] * vec[1] + mat[6] + mat[7];
+    result.x = mat[0] * vec.x + mat[1] * vec.y + mat[2] + mat[3];
+    result.y = mat[4] * vec.x + mat[5] * vec.y + mat[6] + mat[7];
 }
 
-void MultiplyVec2Mat34(const float *vec, const float *mat, float *result)
+void MultiplyVec2Mat34(const Vec3 &vec, const float *mat, Vec3 &result)
 {
-    result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] + mat[3];
-    result[1] = mat[4] * vec[0] + mat[5] * vec[1] + mat[6] + mat[7];
-    result[2] = mat[8] * vec[0] + mat[9] * vec[1] + mat[10] + mat[11];
+    result.x = mat[0] * vec.x + mat[1] * vec.y + mat[2] + mat[3];
+    result.y = mat[4] * vec.x + mat[5] * vec.y + mat[6] + mat[7];
+    result.z = mat[8] * vec.x + mat[9] * vec.y + mat[10] + mat[11];
 }
 
-void MultiplyVec3Mat33(const float *vec, const float *mat, float *result)
+void MultiplyVec3Mat33(const Vec3 &vec, const float *mat, Vec3 &result)
 {
-    result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] * vec[2];
-    result[1] = mat[3] * vec[0] + mat[4] * vec[1] + mat[5] * vec[2];
-    result[2] = mat[6] * vec[0] + mat[7] * vec[1] + mat[8] * vec[2];
+    result.x = mat[0] * vec.x + mat[1] * vec.y + mat[2] * vec.z;
+    result.y = mat[3] * vec.x + mat[4] * vec.y + mat[5] * vec.z;
+    result.z = mat[6] * vec.x + mat[7] * vec.y + mat[8] * vec.z;
 }
 
-void MultiplyVec3Mat34(const float *vec, const float *mat, float *result)
+void MultiplyVec3Mat34(const Vec3 &vec, const float *mat, Vec3 &result)
 {
-    result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] * vec[2] + mat[3];
-    result[1] = mat[4] * vec[0] + mat[5] * vec[1] + mat[6] * vec[2] + mat[7];
-    result[2] = mat[8] * vec[0] + mat[9] * vec[1] + mat[10] * vec[2] + mat[11];
+    result.x = mat[0] * vec.x + mat[1] * vec.y + mat[2] * vec.z + mat[3];
+    result.y = mat[4] * vec.x + mat[5] * vec.y + mat[6] * vec.z + mat[7];
+    result.z = mat[8] * vec.x + mat[9] * vec.y + mat[10] * vec.z + mat[11];
 }
 
-void MultipleVec3Perspective(const float *vec, const float *proj, float *result)
+void MultipleVec3Perspective(const Vec3 &vec, const float *proj, Vec4 &result)
 {
-    result[0] = proj[0] * vec[0] + proj[1] * vec[2];
-    result[1] = proj[2] * vec[1] + proj[3] * vec[2];
-    //result[2] = (proj[4] * vec[2] + proj[5]);
-    result[2] = (proj[4] * vec[2] + proj[5]) * (1.0f - (float)1e-7);
-    result[3] = -vec[2];
+    result.x = proj[0] * vec.x + proj[1] * vec.z;
+    result.y = proj[2] * vec.y + proj[3] * vec.z;
+    //result.z = (proj[4] * vec.z + proj[5]);
+    result.z = (proj[4] * vec.z + proj[5]) * (1.0f - (float)1e-7);
+    result.w = -vec.z;
 }
 
-void MultipleVec3Ortho(const float *vec, const float *proj, float *result)
+void MultipleVec3Ortho(const Vec3 &vec, const float *proj, Vec4 &result)
 {
-    result[0] = proj[0] * vec[0] + proj[1];
-    result[1] = proj[2] * vec[1] + proj[3];
-    result[2] = proj[4] * vec[2] + proj[5];
-    result[3] = 1;
+    result.x = proj[0] * vec.x + proj[1];
+    result.y = proj[2] * vec.y + proj[3];
+    result.z = proj[4] * vec.z + proj[5];
+    result.w = 1;
 }
 
 void TransformPosition(const InputVertexData *src, OutputVertexData *dst)
@@ -98,55 +99,53 @@ void TransformNormal(const InputVertexData *src, bool nbt, OutputVertexData *dst
         MultiplyVec3Mat33(src->normal[0], mat, dst->normal[0]);
         MultiplyVec3Mat33(src->normal[1], mat, dst->normal[1]);
         MultiplyVec3Mat33(src->normal[2], mat, dst->normal[2]);
-        Vec3 *norm0 = (Vec3*)dst->normal[0];
-        norm0->normalize();
+        dst->normal[0].normalize();
     }
     else
     {
         MultiplyVec3Mat33(src->normal[0], mat, dst->normal[0]);
-        Vec3 *norm0 = (Vec3*)dst->normal[0];
-        norm0->normalize();
+        dst->normal[0].normalize();
     }    
 }
 
 inline void TransformTexCoordRegular(const TexMtxInfo &texinfo, int coordNum, bool specialCase, const InputVertexData *srcVertex, OutputVertexData *dstVertex)
 {
-    const float *src;
+    const Vec3 *src;
     switch (texinfo.sourcerow)
     {
         case XF_SRCGEOM_INROW:
-            src = srcVertex->position;
+            src = &srcVertex->position;
             break;
         case XF_SRCNORMAL_INROW:
-            src = srcVertex->normal[0];
+            src = &srcVertex->normal[0];
             break;
         case XF_SRCBINORMAL_T_INROW:
-            src = srcVertex->normal[1];
+            src = &srcVertex->normal[1];
             break;
         case XF_SRCBINORMAL_B_INROW:
-            src = srcVertex->normal[2];
+            src = &srcVertex->normal[2];
             break;
         default:
             _assert_(texinfo.sourcerow >= XF_SRCTEX0_INROW && texinfo.sourcerow <= XF_SRCTEX7_INROW);
-            src = srcVertex->texCoords[texinfo.sourcerow - XF_SRCTEX0_INROW];
+            src = (const Vec3*)srcVertex->texCoords[texinfo.sourcerow - XF_SRCTEX0_INROW]; // keep the source vertex const
             break;
     }
 
     const float *mat = (const float*)&xfregs.posMatrices[srcVertex->texMtx[coordNum] * 4];
-    float *dst = dstVertex->texCoords[coordNum];
+    Vec3 *dst = &dstVertex->texCoords[coordNum];
 
     if (texinfo.inputform == XF_TEXINPUT_AB11)
     {
-        MultiplyVec2Mat34(src, mat, dst); 
+        MultiplyVec2Mat34(*src, mat, *dst); 
     }
     else
     {
-        MultiplyVec3Mat34(src, mat, dst); 
+        MultiplyVec3Mat34(*src, mat, *dst); 
     }
 
     if (xfregs.dualTexTrans)
     {
-        float tempCoord[3];
+        Vec3 tempCoord;
 
         // normalize
         const PostMtxInfo &postInfo = xfregs.postMtxInfo[coordNum];
@@ -157,12 +156,12 @@ inline void TransformTexCoordRegular(const TexMtxInfo &texinfo, int coordNum, bo
 			// no normalization
 			// q of input is 1
 			// q of output is unknown
-			tempCoord[0] = dst[0];
-			tempCoord[1] = dst[1];
+			tempCoord.x = dst->x;
+			tempCoord.y = dst->y;
 
-			dst[0] = postMat[0] * tempCoord[0] + postMat[1] * tempCoord[1] + postMat[2] + postMat[3];
-			dst[1] = postMat[4] * tempCoord[0] + postMat[5] * tempCoord[1] + postMat[6] + postMat[7];
-			dst[2] = 0.0f;
+			dst->x = postMat[0] * tempCoord.x + postMat[1] * tempCoord.y + postMat[2] + postMat[3];
+			dst->y = postMat[4] * tempCoord.x + postMat[5] * tempCoord.y + postMat[6] + postMat[7];
+			dst->z = 1.0f;
 		}
 		else
 		{		
@@ -170,18 +169,14 @@ inline void TransformTexCoordRegular(const TexMtxInfo &texinfo, int coordNum, bo
 			{
 				float length = sqrtf(dst[0] * dst[0] + dst[1] * dst[1] + dst[2] * dst[2]);
 				float invL = 1.0f / length;
-				tempCoord[0] = invL * dst[0];
-				tempCoord[1] = invL * dst[1];
-				tempCoord[2] = invL * dst[2];
+				tempCoord = *dst * invL;
 			}
 			else
 			{
-				tempCoord[0] = dst[0];
-				tempCoord[1] = dst[1];
-				tempCoord[2] = dst[2];
+				tempCoord = *dst;
 			}
 
-			MultiplyVec3Mat34(tempCoord, postMat, dst);
+			MultiplyVec3Mat34(tempCoord, postMat, *dst);
 		}
     }
 }
@@ -220,13 +215,8 @@ inline float SafeDivide(float n, float d)
     return (d==0)?(n>0?1:0):n/d;
 }
 
-void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const LitChannel &chan, Vec3 &lightCol)
+void LightColor(const Vec3 &pos, const Vec3 &normal, u8 lightNum, const LitChannel &chan, Vec3 &lightCol)
 {
-    // must be the size of 3 32bit floats for the light pointer to be valid
-    _assert_(sizeof(Vec3) == 12);
-
-    const Vec3 *pos = (const Vec3*)vertexPos;
-    const Vec3 *norm0 = (const Vec3*)normal;
     const LightPointer *light = (const LightPointer*)&xfregs.lights[0x10*lightNum];
 
     if (!(chan.attnfunc & 1)) {
@@ -237,15 +227,15 @@ void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const
                 break;
             case LIGHTDIF_SIGN:
                 {
-                    Vec3 ldir = (light->pos - *pos).normalized();
-                    float diffuse = ldir * (*norm0);
+                    Vec3 ldir = (light->pos - pos).normalized();
+                    float diffuse = ldir * normal;
                     AddScaledIntegerColor(light->color, diffuse, lightCol);
                 }
                 break;
             case LIGHTDIF_CLAMP:
                 {
-                    Vec3 ldir = (light->pos - *pos).normalized();
-                    float diffuse = max(0.0f, ldir * (*norm0));
+                    Vec3 ldir = (light->pos - pos).normalized();
+                    float diffuse = max(0.0f, ldir * normal);
                     AddScaledIntegerColor(light->color, diffuse, lightCol);
                 }
                 break;
@@ -254,7 +244,7 @@ void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const
     }
     else { // spec and spot
         // not sure about divide by zero checks
-        Vec3 ldir = light->pos - *pos;
+        Vec3 ldir = light->pos - pos;
         float attn;
 
         if (chan.attnfunc == 3) { // spot
@@ -269,7 +259,7 @@ void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const
         }
         else if (chan.attnfunc == 1) { // specular
             // donko - what is going on here?  655.36 is a guess but seems about right.
-            attn = (light->pos * (*norm0)) > -655.36 ? max(0.0f, (light->dir * (*norm0))) : 0;
+            attn = (light->pos * normal) > -655.36 ? max(0.0f, (light->dir * normal)) : 0;
             ldir.set(1.0f, attn, attn * attn);
 
             float cosAtt = max(0.0f, light->cosatt * ldir);
@@ -283,14 +273,14 @@ void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const
                 break;
             case LIGHTDIF_SIGN:
                 {
-                    float difAttn = ldir * (*norm0);
+                    float difAttn = ldir * normal;
                     AddScaledIntegerColor(light->color, attn * difAttn, lightCol);
                 }
                 break;
 
             case LIGHTDIF_CLAMP:
                 {
-                    float difAttn = max(0.0f, ldir * (*norm0));
+                    float difAttn = max(0.0f, ldir * normal);
                     AddScaledIntegerColor(light->color, attn * difAttn, lightCol);
                 }
                 break;
@@ -299,13 +289,8 @@ void LightColor(const float *vertexPos, const float *normal, u8 lightNum, const
     }
 }
 
-void LightAlpha(const float *vertexPos, const float *normal, u8 lightNum, const LitChannel &chan, float &lightCol)
+void LightAlpha(const Vec3 &pos, const Vec3 &normal, u8 lightNum, const LitChannel &chan, float &lightCol)
 {
-    // must be the size of 3 32bit floats for the light pointer to be valid
-    _assert_(sizeof(Vec3) == 12);
-
-    const Vec3 *pos = (const Vec3*)vertexPos;
-    const Vec3 *norm0 = (const Vec3*)normal;
     const LightPointer *light = (const LightPointer*)&xfregs.lights[0x10*lightNum];
 
     if (!(chan.attnfunc & 1)) {
@@ -316,15 +301,15 @@ void LightAlpha(const float *vertexPos, const float *normal, u8 lightNum, const
                 break;
             case LIGHTDIF_SIGN:
                 {
-                    Vec3 ldir = (light->pos - *pos).normalized();                    
-                    float diffuse = ldir * (*norm0);
+                    Vec3 ldir = (light->pos - pos).normalized();                    
+                    float diffuse = ldir * normal;
                     lightCol += light->color[0] * diffuse;
                 }
                 break;
             case LIGHTDIF_CLAMP:
                 {
-                    Vec3 ldir = (light->pos - *pos).normalized();
-                    float diffuse = max(0.0f, ldir * (*norm0));
+                    Vec3 ldir = (light->pos - pos).normalized();
+                    float diffuse = max(0.0f, ldir * normal);
                     lightCol += light->color[0] * diffuse;
                 }
                 break;
@@ -332,7 +317,7 @@ void LightAlpha(const float *vertexPos, const float *normal, u8 lightNum, const
         }
     }
     else { // spec and spot
-        Vec3 ldir = light->pos - *pos;
+        Vec3 ldir = light->pos - pos;
         float attn;
 
         if (chan.attnfunc == 3) { // spot
@@ -347,7 +332,7 @@ void LightAlpha(const float *vertexPos, const float *normal, u8 lightNum, const
         }
         else if (chan.attnfunc == 1) { // specular
             // donko - what is going on here?  655.36 is a guess but seems about right.
-            attn = (light->pos * (*norm0)) > -655.36 ? max(0.0f, (light->dir * (*norm0))) : 0;
+            attn = (light->pos * normal) > -655.36 ? max(0.0f, (light->dir * normal)) : 0;
             ldir.set(1.0f, attn, attn * attn);
 
             float cosAtt = light->cosatt * ldir;
@@ -361,14 +346,14 @@ void LightAlpha(const float *vertexPos, const float *normal, u8 lightNum, const
                 break;
             case LIGHTDIF_SIGN:
                 {
-                    float difAttn = ldir * (*norm0);
+                    float difAttn = ldir * normal;
                     lightCol += light->color[0] * attn * difAttn;
                 }
                 break;
 
             case LIGHTDIF_CLAMP:
                 {
-                    float difAttn = max(0.0f, ldir * (*norm0));
+                    float difAttn = max(0.0f, ldir * normal);
                     lightCol += light->color[0] * attn * difAttn;
                 }
                 break;
@@ -472,14 +457,11 @@ void TransformTexCoord(const InputVertexData *src, OutputVertexData *dst, bool s
             break;
         case XF_TEXGEN_EMBOSS_MAP:
             {
-                const Vec3 *pos = (const Vec3*)dst->mvPosition;
-                const Vec3 *norm1 = (const Vec3*)dst->normal[1];
-                const Vec3 *norm2 = (const Vec3*)dst->normal[2];
                 const LightPointer *light = (const LightPointer*)&xfregs.lights[0x10*texinfo.embosslightshift];
 
-                Vec3 ldir = (light->pos - *pos).normalized();
-                float d1 = ldir * (*norm1);
-                float d2 = ldir * (*norm2);
+                Vec3 ldir = (light->pos - dst->mvPosition).normalized();
+                float d1 = ldir * dst->normal[1];
+                float d2 = ldir * dst->normal[2];
 
                 dst->texCoords[coordNum][0] = dst->texCoords[texinfo.embosssourceshift][0] + d1;
                 dst->texCoords[coordNum][1] = dst->texCoords[texinfo.embosssourceshift][1] + d2;
@@ -503,6 +485,9 @@ void TransformTexCoord(const InputVertexData *src, OutputVertexData *dst, bool s
         default:
             ERROR_LOG(VIDEO, "Bad tex gen type %i", texinfo.texgentype);            
         }
+
+		dst->texCoords[coordNum][0] *= (bpmem.texcoords[coordNum].s.scale_minus_1 + 1);
+		dst->texCoords[coordNum][1] *= (bpmem.texcoords[coordNum].t.scale_minus_1 + 1);
     }
 }
 
diff --git a/Source/Plugins/Plugin_VideoSoftware/Src/VertexFormatConverter.cpp b/Source/Plugins/Plugin_VideoSoftware/Src/VertexFormatConverter.cpp
index d3d3f421c9..f50220aaad 100644
--- a/Source/Plugins/Plugin_VideoSoftware/Src/VertexFormatConverter.cpp
+++ b/Source/Plugins/Plugin_VideoSoftware/Src/VertexFormatConverter.cpp
@@ -24,32 +24,32 @@ namespace VertexFormatConverter
 {
     void LoadNormal1_Byte(InputVertexData *dst, u8 *src)
     {
-        dst->normal[0][0] = (float)(s8)src[0] / 128;
-        dst->normal[0][1] = (float)(s8)src[1] / 128;
-        dst->normal[0][2] = (float)(s8)src[2] / 128;
+        dst->normal[0].x = (float)(s8)src[0] / 128;
+        dst->normal[0].y = (float)(s8)src[1] / 128;
+        dst->normal[0].z = (float)(s8)src[2] / 128;
     }
 
     void LoadNormal1_Short(InputVertexData *dst, u8 *src)
     {
-        dst->normal[0][0] = (float)((s16*)src)[0] / 32768;
-        dst->normal[0][1] = (float)((s16*)src)[1] / 32768;
-        dst->normal[0][2] = (float)((s16*)src)[2] / 32768;
+        dst->normal[0].x = (float)((s16*)src)[0] / 32768;
+        dst->normal[0].y = (float)((s16*)src)[1] / 32768;
+        dst->normal[0].z = (float)((s16*)src)[2] / 32768;
     }
 
     void LoadNormal1_Float(InputVertexData *dst, u8 *src)
     {
-        dst->normal[0][0] = ((float*)src)[0];
-        dst->normal[0][1] = ((float*)src)[1];
-        dst->normal[0][2] = ((float*)src)[2];
+        dst->normal[0].x = ((float*)src)[0];
+        dst->normal[0].y = ((float*)src)[1];
+        dst->normal[0].z = ((float*)src)[2];
     }
 
     void LoadNormal3_Byte(InputVertexData *dst, u8 *src)
     {
         for (int i = 0, j = 0; i < 3; i++, j+=3)
         {
-            dst->normal[i][0] = (float)(s8)src[j + 0] / 128;
-            dst->normal[i][1] = (float)(s8)src[j + 1] / 128;
-            dst->normal[i][2] = (float)(s8)src[j + 2] / 128;
+            dst->normal[i].x = (float)(s8)src[j + 0] / 128;
+            dst->normal[i].y = (float)(s8)src[j + 1] / 128;
+            dst->normal[i].z = (float)(s8)src[j + 2] / 128;
         }
     }
 
@@ -57,9 +57,9 @@ namespace VertexFormatConverter
     {
         for (int i = 0, j = 0; i < 3; i++, j+=3)
         {
-            dst->normal[i][0] = (float)((s16*)src)[j + 0] / 32768;
-            dst->normal[i][1] = (float)((s16*)src)[j + 1] / 32768;
-            dst->normal[i][2] = (float)((s16*)src)[j + 2] / 32768;
+            dst->normal[i].x = (float)((s16*)src)[j + 0] / 32768;
+            dst->normal[i].y = (float)((s16*)src)[j + 1] / 32768;
+            dst->normal[i].z = (float)((s16*)src)[j + 2] / 32768;
         }
     }
 
@@ -67,9 +67,9 @@ namespace VertexFormatConverter
     {
         for (int i = 0, j = 0; i < 3; i++, j+=3)
         {
-            dst->normal[i][0] = ((float*)src)[j + 0];
-            dst->normal[i][1] = ((float*)src)[j + 1];
-            dst->normal[i][2] = ((float*)src)[j + 2];
+            dst->normal[i].x = ((float*)src)[j + 0];
+            dst->normal[i].y = ((float*)src)[j + 1];
+            dst->normal[i].z = ((float*)src)[j + 2];
         }
     }
 }